Thrust issue? Please help me, anyone who is familiar with Thrust!

Hi,dear all

I use thrust::sort() and get errors such as:

First-chance exception at 0x000007FEFD6BA06D in xxx.exe: Microsoft C++ exception: thrust::system::system_error at memory location 0x000000000021D600.

Unhandled exception at 0x000007FEFD6BA06D in xxx.exe: Microsoft C++ exception: thrust::system::system_error at memory location 0x000000000021D600.

invalid device pointer
{device_radix_sort_dispatch.cuh}

Internal debugger error occurred while attempting to launch _ZN6thrust6system4cuda6detail4cub_30DeviceRadixSortDownsweepKernelINS3_23DeviceRadixSortDispatchILb0EjNS3_8NullTypeEiE18PtxDownsweepPolicyELb0EjS6_iEEvPT1_SA_PT2_SC_PT3_SD_iibbNS3_13GridEvenShareISD_EE in
CUcontext 0x0a5cab00, CUmodule 0xa96699a0:
code patching failed for unknown reason.

All breakpoints for function _ZN6thrust6system4cuda6detail4cub_30DeviceRadixSortDownsweepKernelINS3_23DeviceRadixSortDispatchILb0EjNS3_8NullTypeEiE18PtxDownsweepPolicyELb0EjS6_iEEvPT1_SA_PT2_SC_PT3_SD_iibbNS3_13GridEvenShareISD_EE have been removed.

See Output View for additional messages of this type.

my env is:

NVIDIA Nsight Visual Studio Edition Version 5.2
{Build Number 5.2.0.16321}
CUDA8.0 >{CUDA Toolkit 7.5 or 8.0}
GTX1080
Visual Studio 2013
Driver{376.51}>{NVIDIA Display Driver version 376.09}

codes as:
thrust::device_vector<unsigned int> bucket_indices(nLevelSize);
xxx…
thrust::sort(bucket_indices.begin(), bucket_indices.end());

can anyone tell me how i can resolve it?
thrust bug?
nsight bug?
my code bug?

The first execution is OK.
The second execution fails, e.g. when the code runs inside a loop such as "for 1:10".

thanks very much!

Hello,Dear harryz_

first,thanks very much for your reply.

I have (1) disabled TDR in Nsight Monitor and (2) disabled the Nsight memory checker,
but the error still occurs.
So I am sure that the error is caused by a memory leak in my own code.

I am confused because, inside a for loop,
the first execution is OK (n = 0),
but the second execution fails (n = 1).

for (int n = 0;n<3;n++)
{
my code exec;
}

Why does this happen?
How can I quickly fix this memory leak error?

hope that your helps!thanks very much!

this is my mini sample code, thanks very much!

my env is:

NVIDIA Nsight Visual Studio Edition Version 5.2
{Build Number 5.2.0.16321}
CUDA8.0 >{CUDA Toolkit 7.5 or 8.0}
GTX1080
Visual Studio 2013
Driver{376.51}>{NVIDIA Display Driver version 376.09}

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <iostream>

#include <cuda_runtime.h>

#include <helper_functions.h>
#include <helper_cuda.h>

#include "device_launch_parameters.h"

using namespace std;

// thrust
#include <thrust/host_vector.h> // host vector
#include <thrust/device_vector.h> // device vector
#include <thrust/device_ptr.h> // device ptr
#include <thrust/iterator/counting_iterator.h> // iterator
#include <thrust/transform.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/generate.h>
#include <thrust/binary_search.h>
#include <thrust/random.h>
#include <thrust/functional.h>

// Unary functor that converts a float2 point into the linear index of the
// bucket (cell) it falls into on a w-by-h grid.
//
// The index is computed as y * w + x, where x and y are p.x and p.y scaled by
// (w - 1) and (h - 1) respectively and truncated to unsigned int.
// NOTE(review): this presumably expects p.x and p.y to lie in [0, 1] so that
// the result stays inside [0, w * h) — confirm against the caller's data.
struct point_to_bucket_index : public thrust::unary_function<float2, unsigned int>
{
    // Stores the grid dimensions used by operator().
    //   width  - number of buckets along x
    //   height - number of buckets along y
    __host__ __device__ point_to_bucket_index(unsigned int width, unsigned int height)
        : w(width), h(height) {}

    // Returns the linear (row-major) bucket index for point p.
    __host__ __device__ unsigned int operator()(float2 p) const
    {
        // find the raster indices of p's bucket
        unsigned int x = static_cast<unsigned int>(p.x * (w - 1));
        unsigned int y = static_cast<unsigned int>(p.y * (h - 1));

        // return the bucket's linear index
        return y * w + x;
    }

    // grid width and height, in buckets
    unsigned int w, h;
};

global void TransToFloat2(const float* d_Input1, const float* d_Input2, float2* d_Output, const int nSize, const int nBlocksNumX)
{
const int tid = (blockIdx.y * nBlocksNumX + blockIdx.x) * blockDim.x + threadIdx.x;

if (tid < nSize)
{
d_Output[tid] = make_float2(d_Input1[tid], d_Input2[tid]);
}

}

void MiniLevelCompute(const int nLevel)
{
int nMaxIterNum = 15;

int nWidth = 512;
int nHeight = 512;
int nSlice = 32;

int nLevelWidth = int((nWidth - 1) / pow(2, nLevel)) + 1;
int nLevelHeight = int((nHeight - 1) / pow(2, nLevel)) + 1;
int nLevelSlice = int((nSlice - 1) / pow(2, nLevel)) + 1;

int nLevelSize = nLevelWidth * nLevelHeight * nLevelSlice;

int nDataSizeByte = nLevelSize * sizeof(float);

float* d_ImageDataRef_tmp;
float* d_ImageDataMov_tmp;

checkCudaErrors(cudaMalloc((void**)&d_ImageDataRef_tmp, nDataSizeByte));
checkCudaErrors(cudaMalloc((void**)&d_ImageDataMov_tmp, nDataSizeByte));

checkCudaErrors(cudaMemset(d_ImageDataRef_tmp, 0, nDataSizeByte));
checkCudaErrors(cudaMemset(d_ImageDataMov_tmp, 0, nDataSizeByte));

int nHistBinsNum = 150;

thrust::device_vector bucket_begin(nHistBinsNum * nHistBinsNum);
thrust::device_vector bucket_end(nHistBinsNum * nHistBinsNum);
thrust::device_vector bucket_indices(nLevelSize);
thrust::device_vector bucket_sizes(nHistBinsNum * nHistBinsNum);

float2 d_points;
cudaMalloc((void
*)&d_points, sizeof(float2)* nLevelSize);

int nBlocksNumX = 1024;
int nThreadsNumPerBlock = 256;

dim3 nblocks;
nblocks.x = nBlocksNumX;
nblocks.y = ((1 + (nLevelSize - 1) / nThreadsNumPerBlock) - 1) / nBlocksNumX + 1;

for (int nIter = 0; nIter < nMaxIterNum; nIter++)
{

TransToFloat2 << <nblocks, nThreadsNumPerBlock >> >(d_ImageDataRef_tmp, d_ImageDataMov_tmp, d_points, nLevelSize, nBlocksNumX);

thrust::device_ptr points_t(d_points);

thrust::transform(points_t, points_t + nLevelSize, bucket_indices.begin(), point_to_bucket_index(nHistBinsNum, nHistBinsNum));

cout << "debug Iter : " << nIter << endl;

thrust::sort(bucket_indices.begin(), bucket_indices.end());

cout << "debug Iter : " << nIter << endl;
}

cudaFree(d_points);
cudaFree(d_ImageDataMov_tmp);
cudaFree(d_ImageDataMov_tmp);

}

int test_CUDA_error()
{
int nDevice = 0;

cudaSetDevice(nDevice);

int nDeviceCount;

cudaDeviceProp cDeviceProp;

cudaGetDeviceCount(&nDeviceCount);

cudaGetDeviceProperties(&cDeviceProp, nDevice);

if (1)
{
cout << "Using device # " << nDevice << endl;

cout << "Max threads per block: " << cDeviceProp.maxThreadsPerBlock << endl;

cout << "Max Threads DIM: " << cDeviceProp.maxThreadsDim[0] << " x " << cDeviceProp.maxThreadsDim[1] << " x " << cDeviceProp.maxThreadsDim[2] << endl;

cout << "Max Grid Size: " << cDeviceProp.maxGridSize[0] << " x " << cDeviceProp.maxGridSize[1] << " x " << cDeviceProp.maxGridSize[2] << endl;

printf(“Device %d: "%s" with Compute %d.%d capability\n”, nDevice, cDeviceProp.name, cDeviceProp.major, cDeviceProp.minor);
}

for (int nLevel = 1; nLevel >= 0; nLevel–)
{
MiniLevelCompute(nLevel);
}

return 0;
}

I can reproduce it on my GTX 1070 with CUDA 9.2. Your app crashes at line 92, thrust::device_vector bucket_begin(nHistBinsNum * nHistBinsNum);, in the second level. It causes a GPU exception, so Nsight doesn’t work; it fails even without Nsight. I’m not familiar with Thrust programming, so I think you should post your question on the CUDA programming forum.

You shouldn’t do this:

cudaFree(d_ImageDataMov_tmp);
cudaFree(d_ImageDataMov_tmp);

doing a free operation on that pointer twice will throw an error on the 2nd call.

Hello,dear txbob

you are great!

Commenting out these lines avoids the exception:

//cudaFree(d_points);
//cudaFree(d_ImageDataMov_tmp);
//cudaFree(d_ImageDataMov_tmp);

But I would like to know: can a memory leak occur on the GPU just as it can on the CPU?
How can I avoid the memory leak?

thanks very much!

Don’t just comment those out. That will create a memory leak. Instead just free each pointer once:

cudaFree(d_points);
cudaFree(d_ImageDataRef_tmp);
cudaFree(d_ImageDataMov_tmp);

(exactly the way you would with cpu code to avoid a memory leak)

OK,thank you very much!