Hi Nvidia:
I am trying to use pthread and streams to do parallel calculation.
I used UMA (cudaMallocManaged) to get memories, then sent them into CUDA kernels.
After finishing kernels, when CPU is trying to access UMA allocated memories, segmentation fault is rasied.
But if I run the program in single thread (with or without pthread were the same), the program will successfully executed.
Please help, many thanks.
Platform:
Jetson AGX Orin Dev. Kit
Jetpack 5.0.2
CUDA version: cuda-11.4
CUDA-GDB Version (/usr/local/cuda-11.4/bin/cuda-gdb --version):
11.4 release
GNU gdb(GDB) 10.4
Code flow is shown below:
main.cpp
typedef struct{
double *imgSrc;
double *rslt;
int streamNum;
}calculateData_t ;
int main()
{
// Init memories
cudaMallocManaged(imgSrc1 , size_for_img );
cudaMallocManaged(imgSrc2 , size_for_img );
cudaMallocManaged(rslt1 , size_for_rslt );
cudaMallocManaged(rslt2 , size_for_rslt );
// Create Streams
int nstreams = 2;
streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++ )
cudaStreamCreate(& (streams[i]) );
calculateData_t data1, data2;
data1.imgSrc = imgSrc1;
data1.rslt = rslt1;
data1.streamNum = 0;
data2.imgSrc = imxSrc2;
data2.rslt = rslt2;
data2.streamNum = 1;
// Create Threads
pthread_create(&thread1,NULL,calFunctions,&data1);
pthread_create(&thread2,NULL,calFunctions,&data2);
pthread_join(thread1, NULL);
pthread_join(thread2, NULL);
// Free memories and distroy streams
...
}
void *calFunctions(void *arg)
{
calculateData_t *d = (calculateData_t *) arg;
double *img = d->imgSrc;
double *rslt = d->rslt;
int streamNum = d->streamNum;
// To taskDispatcher.cu
calFunctionsDispatch(imgSrc, rslt, streamNum);
pthread_exit(NULL);
}
taskDispatcher.cu:
extern "C" void calFunctionsDispatch(double *img, double *rslt, int streamNum)
{
// Calculate grid dim and block dim
...
// Start kernel in kernels.cu
ColSum_16d_parallel<<< griddim, blkdim , 0 ,streams[streamNum]>>>(img, rslt);
// Wait stream finishing its job
cudaStreamSynchronize(streams[streamNum]);
// Do CPU calculation
double rslt_value[8] = {};
for( int i = 0 ; i < 8 ; i ++)
{
// Segmentation Fault Rasied
rslt_value[0] += rslt[i];
}
// print out rslt_value
...
return;
}
kernels.cu
__global__ void ColSum_16d_parallel(const double *img, double *rslt)
{
int bid = blockIdx.x;
int tid = threadIdx.x;
double rslt_value = 0.0;
int rsltIdx = bid * colLength + tid;
// if data_within_range --> add cols value into rslt;
if (DATA_WITHIN_RANGE)
{
for (int i = 0 ; i < colLength ; i++ )
{
rslt_value += src1[i];
}
rslt[rsltIdx] = rslt_value;
}
}
Compiling Code:
nvcc -G -g -Xcompiler "-fPIC" taskDispatcher.cu -c -o build/taskDispatcher.o
g++ -c -g main.cpp -o build/main.o
g++ -o colSum build/main.o build/taskDispatcher.o -lcudart -L/usr/local/cuda/lib64 -lpthread