cuda thrust::sort_by_key system error

Hi everyone,

//here is code
thrust::device_ptr dev_data_ptr(d_out);
thrust::device_ptr dev_keys_ptr(d_indx);

cudaDeviceSynchronize();
thrust::sort_by_key(dev_data_ptr, dev_data_ptr + size, dev_keys_ptr);
the d_out and d_indx value
output**** 0 “0”
output**** 1 “4”
output**** 2 “4”
output**** 3 “0”
output**** 4 “4”
output**** 5 “20”
output**** 6 “17”
output**** 7 “63”
h_indx**** 0 “0”
h_indx**** 1 “1”
h_indx**** 2 “2”
h_indx**** 3 “3”
h_indx**** 4 “4”
h_indx**** 5 “5”
h_indx**** 6 “6”
h_indx**** 7 “7”
and i got the error cuda system error
IsolationDeleteExe.exe 中的 0x7527a932 (KernelBase.dll) 处有未经处理的异常: Microsoft C++ 异常: 内存位置 0x0097e7ec 处的 thrust::system::system_error。
////////stack
msvcr100.dll!_CxxThrowException(void * pExceptionObject, const s__ThrowInfo * pThrowInfo) 行 157 C++
IsolationDeleteExe.exe!thrust::system::cuda::detail::bulk
::detail::throw_on_error(cudaError e, const char * message) 行 49 C++

IsolationDeleteExe.exe!thrust::system::cuda::detail::throw_on_error(cudaError error, const char * message) 行 37 + 0xd 字节 C++
IsolationDeleteExe.exe!thrust::system::cuda::detail::detail::stable_radix_sort_detail::stable_radix_sort_by_key_n<thrust::system::cuda::detail::tag,int,int,thrust::less >(thrust::system::cuda::detail::execution_policythrust::system::cuda::detail::tag & exec, int * first1, unsigned int n, int * first2, thrust::less comp) 行 452 + 0x87 字节 C++
IsolationDeleteExe.exe!thrust::system::cuda::detail::detail::stable_radix_sort_by_key<thrust::system::cuda::detail::tag,thrust::device_ptr,thrust::device_ptr >(thrust::system::cuda::detail::execution_policythrust::system::cuda::detail::tag & exec, thrust::device_ptr first1, thrust::device_ptr last1, thrust::device_ptr first2, thrust::less comp) 行 494 + 0x67 字节 C++
IsolationDeleteExe.exe!thrust::system::cuda::detail::detail::stable_primitive_sort_detail::stable_primitive_sort_by_key<thrust::system::cuda::detail::tag,thrust::device_ptr,thrust::device_ptr,thrust::less >(thrust::system::cuda::detail::execution_policythrust::system::cuda::detail::tag & exec, thrust::device_ptr keys_first, thrust::device_ptr keys_last, thrust::device_ptr values_first, thrust::less comp) 行 182 + 0x1d 字节 C++
IsolationDeleteExe.exe!thrust::system::cuda::detail::detail::stable_primitive_sort_by_key<thrust::system::cuda::detail::tag,thrust::device_ptr,thrust::device_ptr >(thrust::system::cuda::detail::execution_policythrust::system::cuda::detail::tag & exec, thrust::device_ptr keys_first, thrust::device_ptr keys_last, thrust::device_ptr values_first, thrust::less comp) 行 224 + 0x1d 字节 C++
IsolationDeleteExe.exe!thrust::system::cuda::detail::stable_sort_detail::stable_sort_by_key<thrust::system::cuda::detail::tag,thrust::device_ptr,thrust::device_ptr,thrust::less >(thrust::system::cuda::detail::execution_policythrust::system::cuda::detail::tag & exec, thrust::device_ptr keys_first, thrust::device_ptr keys_last, thrust::device_ptr values_first, thrust::less comp) 行 152 + 0x83 字节 C++
IsolationDeleteExe.exe!thrust::system::cuda::detail::stable_sort_by_key<thrust::system::cuda::detail::tag,thrust::device_ptr<int>,thrust::device_ptr<int>,thrust::less<int> >'::2’::workaround::parallel_path(thrust::system::cuda::detail::execution_policythrust::system::cuda::detail::tag & exec, thrust::device_ptr keys_first, thrust::device_ptr keys_last, thrust::device_ptr values_first, thrust::less comp) 行 259 + 0x1d 字节 C++
IsolationDeleteExe.exe!thrust::system::cuda::detail::stable_sort_by_key<thrust::system::cuda::detail::tag,thrust::device_ptr,thrust::device_ptr,thrust::less >(thrust::system::cuda::detail::execution_policythrust::system::cuda::detail::tag & exec, thrust::device_ptr keys_first, thrust::device_ptr keys_last, thrust::device_ptr values_first, thrust::less comp) 行 274 + 0x1a 字节 C++
IsolationDeleteExe.exe!thrust::stable_sort_by_key<thrust::system::cuda::detail::tag,thrust::device_ptr,thrust::device_ptr,thrust::less >(const thrust::detail::execution_policy_basethrust::system::cuda::detail::tag & exec, thrust::device_ptr keys_first, thrust::device_ptr keys_last, thrust::device_ptr values_first, thrust::less comp) 行 145 + 0x2f 字节 C++
IsolationDeleteExe.exe!thrust::system::detail::generic::sort_by_key<thrust::system::cuda::detail::tag,thrust::device_ptr,thrust::device_ptr,thrust::less >(thrust::execution_policythrust::system::cuda::detail::tag & exec, thrust::device_ptr keys_first, thrust::device_ptr keys_last, thrust::device_ptr values_first, thrust::less comp) 行 91 + 0x1d 字节 C++
IsolationDeleteExe.exe!thrust::sort_by_key<thrust::system::cuda::detail::tag,thrust::device_ptr,thrust::device_ptr,thrust::less >(const thrust::detail::execution_policy_basethrust::system::cuda::detail::tag & exec, thrust::device_ptr keys_first, thrust::device_ptr keys_last, thrust::device_ptr values_first, thrust::less comp) 行 113 + 0x2f 字节 C++
IsolationDeleteExe.exe!thrust::system::detail::generic::sort_by_key<thrust::system::cuda::detail::tag,thrust::device_ptr,thrust::device_ptr >(thrust::execution_policythrust::system::cuda::detail::tag & exec, thrust::device_ptr keys_first, thrust::device_ptr keys_last, thrust::device_ptr values_first) 行 75 + 0x23 字节 C++
IsolationDeleteExe.exe!thrust::sort_by_key<thrust::system::cuda::detail::tag,thrust::device_ptr,thrust::device_ptr >(const thrust::detail::execution_policy_basethrust::system::cuda::detail::tag & exec, thrust::device_ptr keys_first, thrust::device_ptr keys_last, thrust::device_ptr values_first) 行 96 + 0x2a 字节 C++

///////////////////////////////////

Hello,

Thanks for the report. I’d need a bit more information to act on this. Please provide:

  • A minimal test case; e.g. a complete, standalone program that is as small as possible and has few/no dependencies on other software.
  • The entire sequence of commands/actions that reproduce the bug (using the minimal test case).
  • The unabbreviated outputs of those commands/actions including files, stdout/stderr and logs.
  • A description of the environment where the problem occurs (type and version of the hardware, OS, kernel, compiler and any other relevant software).

– Bryce

//##########################
//int sizep = GetPoints.size() ;
int sizep = 8;
//****************************************
double *x_cord = (double ) malloc(sizep sizeof(double));
memset((void *)x_cord, 0, sizeof(double)*sizep);

double *y_cord = (double *) malloc(sizep* sizeof(double));
memset((void *)y_cord, 0, sizeof(double)*sizep);

double *z_cord = (double *) malloc(sizep* sizeof(double));
memset((void *)z_cord, 0, sizeof(double)*sizep);
///////////////////////
int *outdata = (int *) malloc(sizep * sizeof(int));
memset((void *)outdata, 2, sizeof(int)*sizep);
// Before calculating the cumulative bandwidth, initialize bandwidths array to NULL
minx=0; maxx=4; miny=0;  maxy=4;  minz=0;  maxz=4;
int dimx = (maxx - minx)/1;
int dimy = (maxy - miny)/1;
int dimz = (maxz - minz)/1;
int idxgrid = dimx*dimy*dimz;
int * h_end  = NULL;
h_end = (int *) malloc(idxgrid * sizeof(int));
memset((void *)h_end, 0, sizeof(int)*idxgrid);
/////////////////////////////////////

int *h_begin = (int *) malloc(idxgrid * sizeof(int));
memset((void *)h_begin, 0, sizeof(int)*idxgrid);
int *hindx = (int *) malloc(sizep * sizeof(int));
memset((void *)hindx, 1, sizeof(int)*sizep);
//int i=0;
//for(map<int,SplitCloud::PointCloud>::iterator iter = GetPoints.begin(); iter!=GetPoints.end(); ++iter,i++)
//{  
//	
//	x_cord[i]= iter->second.x;
//	y_cord[i]= iter->second.y;
//	z_cord[i]= iter->second.z;

//	//tmpPoints.push_back(iter->second);
//}

	x_cord[0]= 0;
	y_cord[0]= 1;
	z_cord[0]= 2;

	x_cord[1]= 0;
	y_cord[1]= 1;
	z_cord[1]= 3;

	x_cord[2]= 0;
	y_cord[2]= 1;
	z_cord[2]= 4;

	x_cord[3]= 0;
	y_cord[3]=4;
	z_cord[3]= 0;

	x_cord[4]= 0;
	y_cord[4]= 1;
	z_cord[4]= 0;

	x_cord[5]= 0;
	y_cord[5]= 5;
	z_cord[5]= 1;

	x_cord[6]= 1;
	y_cord[6]= 0;
	z_cord[6]= 1;

	x_cord[7]=3;
	y_cord[7]= 3;
	z_cord[7]= 3;

	
	
int ineighour=26;
int *gridnum = (int *) malloc(ineighour * sizeof(int));
memset((void *)gridnum, 0, sizeof(int)*ineighour);
test( x_cord,y_cord,z_cord,outdata,sizep,  minx,  maxx,  miny,  maxy,  minz,  maxz,h_begin,h_end,hindx);

extern “C”
void test(double *x,double *y,double *z, int * out, int size, double minx, double maxx, double miny, double maxy, double minz, double maxz,int * totalgrid_begin,int * totalgrid_end,int *h_indx)
{
int disd = 1;
double * d_datax=NULL;
double * d_datay=NULL;
double * d_dataz=NULL;
int *d_begin = NULL;
int * d_end = NULL;
int *d_indx = NULL;
int d_out = NULL;
int d_gridnm= NULL;
int dimz = (maxz - minz)/disd;
int dimy = (maxy - miny)/disd;
int dimx = (maxx - minx)/disd;
int idxgrid = dimx
dimy
dimz;

int ineighour=26;
int *gridnum = (int *) malloc(ineighour * sizeof(int));
memset((void *)gridnum, 0, sizeof(int)*ineighour);

cudaError_t cudaStatus;
 cudaStatus = cudaSetDevice(0);

///////////////begin end 
cudaStatus = cudaMalloc((void**)&d_begin, idxgrid * sizeof(int));
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMalloc((void**)&d_end, idxgrid * sizeof(int));
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMalloc failed!");
}
cudaMemset((void *)&d_begin, 0, sizeof(int));
cudaMemset((void *)&d_end, 0, sizeof(int));
cudaStatus = cudaMemcpy(d_begin, totalgrid_begin, idxgrid * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMemcpy failed!");
	
}
cudaStatus = cudaMemcpy(d_end, totalgrid_end, idxgrid * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMemcpy failed!");
	
}

// Choose which GPU to run on, change this on a multi-GPU system.
//cudaStatus = cudaSetDevice(0);
//// xy z 
cudaStatus = cudaMalloc((void**)&d_datax, size * sizeof(double));
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMalloc((void**)&d_datay, size * sizeof(double));
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMalloc((void**)&d_dataz, size * sizeof(double));
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMalloc failed!");
}
//cudaMemset(d_indx, 0, sizeof(int)*size);
cudaStatus = cudaMemcpy(d_datax, x, size * sizeof(double), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMemcpy failed!");
	
}
cudaStatus = cudaMemcpy(d_datay, y, size * sizeof(double), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMemcpy failed!");
	
}
cudaStatus = cudaMemcpy(d_dataz, z, size * sizeof(double), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMemcpy failed!");
	
}
////////////////////////indx


cudaStatus = cudaMalloc((void**)&d_indx, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMalloc failed!");
}

cudaStatus = cudaMemcpy(d_indx, h_indx, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMemcpy failed!");
	
}
///////////////////grid num
cudaStatus = cudaMalloc((void**)&d_gridnm, ineighour * sizeof(int));
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMalloc failed!");

}
cudaStatus = cudaMemcpy(d_gridnm, gridnum, ineighour * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMemcpy failed!");
	
}
//////////////////////out/////////////////////////
cudaStatus = cudaMalloc((void**)&d_out, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(d_out, out, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMemcpy failed!");
	
}
/////////////////////////////
// Launch a kernel on the GPU with one thread for each element.,  minx,  maxx,  miny,  maxy,  minz,maxz
int threadsPerBlock = 8;
int blocksPerGrid =(size + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
dim3 cudaBlockSize(threadsPerBlock,1,1);
dim3 cudaGridSize(1, 1, 1);

grid_cau<<<cudaGridSize,cudaBlockSize>>> (d_datax,d_datay,d_dataz, d_out,  size,minx,  maxx,  miny,  maxy,  minz,maxz,d_indx);



thrust::device_ptr<int> dev_data_ptr(d_out);
thrust::device_ptr<int> dev_keys_ptr(d_indx);


thrust::sort_by_key(dev_data_ptr, dev_data_ptr+size, dev_keys_ptr);


//search_start<<<cudaBlockSize,cudaGridSize>>>(d_out,d_begin,d_end,size);

//cudaMemset((void *)d_out, 0, sizeof(int)*size);

//caul_dis<<<cudaBlockSize,cudaGridSize>>>(d_datax,d_datay,d_dataz, d_out,d_indx,d_begin,d_end,size,minx,  maxx,  miny,  maxy,  minz,maxz,d_gridnm);


cudaStatus = cudaMemcpy(out, d_out, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMemcpy failed!");	     
}
//bool bTestResult = thrust::is_sorted(out, out+size-1);

cudaStatus = cudaMemcpy(h_indx, d_indx, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMemcpy failed!");	     
}
cudaStatus = cudaMemcpy(totalgrid_begin, d_begin, idxgrid * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMemcpy failed!");	     
}
cudaStatus = cudaMemcpy(totalgrid_end, d_end, idxgrid * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
	fprintf(stderr, "cudaMemcpy failed!");	     
}

cout<< stderr<<endl;   
cudaFree(d_out);
cudaFree(d_datax);
cudaFree(d_datay);
cudaFree(d_dataz);

cudaFree(d_begin);
cudaFree(d_end);
cudaFree(d_indx);
//free(totalgrid_end);

}

Hey,

Sorry, but that’s not much better. I need a full example - e.g. something I can copy and paste into a text editor and then compile on my end.

Could you put the code on gist.github.com or a similar service if it won’t fit in a post?

this is the .cu file thank you

cu.cu (12.5 KB)

this is the .cu file

Hey,

What version of CUDA are you using? I think this is fixed with CUDA 9.0.