Below CUDA program segment gives “segmentation fault” error
I am allocating memory using CUDA 4.0, Centos 64 bits and a GTX 480. I am not sure where it is failing, I think a cudaMalloc makes my program crashing. I am making pointers of pointers using CUDA on device memory. In this code I am executing concurrent kernels, so a stream is created per kernel.
I just commented out the kernel call, but still I am getting same “Segmentation fault”. Please have a look into below program and give me a hint to overcome this issue.
float randomNumberKernel(int num_data, int num_kernels, int num_blocks) {
mtgp32_kernel_status_t *d_status;
uint32_t **d_data = (uint32_t **) malloc(sizeof(uint32_t) * num_kernels);
uint32_t **h_data = (uint32_t **) malloc(sizeof(uint32_t) * num_kernels);
cudaError_t e;
float gputime;
cudaEvent_t start;
cudaEvent_t end;
cudaStream_t *streams;
streams = (cudaStream_t *) malloc(sizeof(cudaStream_t) * num_kernels);
for (int i = 0; i < num_kernels; ++i)
for (int i = 0; i < num_kernels; ++i)
cudaMalloc((void**)&d_data[i], sizeof(uint32_t) * num_data/num_kernels);
for (int i = 0; i < num_kernels; ++i)
cudaHostAlloc( (void**)&h_data[i], (num_data/num_kernels) * sizeof( uint32_t ), cudaHostAllocWriteCombined );
ccudaEventRecord(start, 0);
/* kernel calls */
for (int i = 0; i < num_kernels; ++i){
mtgp32_uint32_kernel<<< num_blocks, THREAD_NUM,0, streams[i]>>>(d_status, d_data[i], num_data / (num_kernels*num_blocks),i*num_blocks);
e = cudaGetLastError();
if (e != cudaSuccess) {
printf("failure in kernel call.\n%s\n", cudaGetErrorString(e));
for(int i = 0; i < num_kernels; i++) {
cudaStreamSynchronize( streams[i] ) ;
cudaEventRecord(end, 0);
cudaEventElapsedTime(&gputime, start, end);
// release resources
for(int i = 0; i < num_kernels; i++) {
for(int i = 0; i < num_kernels; i++) {
cudaFreeHost( h_data[i] ); cudaFree(d_data[i]);
/* ccutDeleteTimer(timer); */
return gputime;