Hi,
I am trying to develop a simple CUDA application using Nsight on Ubuntu 16.04 LTS. I am working with a persistent kernel which is meant to process incoming coordinates. It is based on a double buffer, as shown here: (c++ - Doubling buffering in CUDA so the CPU can operate on data produced by a persistent kernel - Stack Overflow). It works fine, but if I instantiate an extremely innocuous class at some point in the main method, the program crashes with a segmentation fault:
Thread 1 "SOFAS" received signal SIGSEGV, Segmentation fault.
0x0000000000404724 in main () at ../src/SOFAS.cu:155
155 printf("0 bufrdy=%d \n", *bufrdy);
[Thread 0x7fffee9f0700 (LWP 3003) exited]
[Thread 0x7fffef272700 (LWP 3002) exited]
[Thread 0x7fffefa73700 (LWP 3001) exited]
Error: Failed to suspend device for CUDA device 0, error=CUDBG_ERROR_UNKNOWN(0x1).
Here is the main method:
/*
 * Host driver for the persistent-kernel double-buffer test.
 *
 * Sets up two pinned host buffers, two device buffers and two mapped "mailbox"
 * flags, launches testkernel once on its own stream, and then ping-pongs
 * between the two buffers for ITERS iterations: wait until the flag reads 2,
 * copy the device buffer to the host on a second stream, write 0 to the flag to
 * hand the buffer back, and validate the copied data.
 *
 * Returns 0 on success. Exits with status 1 on a validation failure and
 * returns 1 if an allocation left a null pointer behind.
 */
int main() {
    // Host-side helper object created before any CUDA call is made.
    test t(10);

    // Start from null so that a failed allocation can never leave a garbage
    // pointer behind to be dereferenced later.
    int *hBuf1 = nullptr, *dBuf1 = nullptr, *hBuf2 = nullptr, *dBuf2 = nullptr;
    volatile int *bufrdy1Flag = nullptr, *bufrdy2Flag = nullptr;

    // buffer and "mailbox" setup
    // Pinned host memory for the data buffers, mapped pinned memory for the flags.
    cudaHostAlloc(&hBuf1, DSIZE * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc(&hBuf2, DSIZE * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc(&bufrdy1Flag, sizeof(int), cudaHostAllocMapped);
    cudaHostAlloc(&bufrdy2Flag, sizeof(int), cudaHostAllocMapped);
    cudaCheckErrors("cudaHostAlloc fail");

    // Device memory for the two data buffers.
    cudaMalloc(&dBuf1, DSIZE * sizeof(int));
    cudaMalloc(&dBuf2, DSIZE * sizeof(int));
    cudaCheckErrors("cudaMalloc fail");

    // Belt and braces: do not rely solely on cudaCheckErrors aborting
    // (NOTE(review): its definition is not visible here).
    if (!hBuf1 || !hBuf2 || !bufrdy1Flag || !bufrdy2Flag || !dBuf1 || !dBuf2) {
        fprintf(stderr, "buffer allocation returned a null pointer\n");
        return 1;
    }

    // One stream for the long-running kernel, one for the copies.
    cudaStream_t streamk, streamc;
    cudaStreamCreate(&streamk);
    cudaStreamCreate(&streamc);
    cudaCheckErrors("cudaStreamCreate fail");

    // Both mailboxes start at 0; the host loop below waits for them to read 2.
    *bufrdy1Flag = 0;
    *bufrdy2Flag = 0;

    // Fill device memory buffers with 0xFF bytes.
    cudaMemset(dBuf1, 0xFF, DSIZE * sizeof(int));
    cudaMemset(dBuf2, 0xFF, DSIZE * sizeof(int));
    cudaCheckErrors("cudaMemset fail");

    // inefficient crutch for choosing number of blocks:
    // one block per multiprocessor of the *current* device.
    int device = 0;
    int nblock = 0;
    cudaGetDevice(&device);
    cudaDeviceGetAttribute(&nblock, cudaDevAttrMultiProcessorCount, device);
    cudaCheckErrors("get multiprocessor count fail");

    printf("kernel launching with <<<%d,%d>>> \n",nblock, nTPB);
    // NOTE(review): the mapped host flag pointers are handed to the kernel
    // directly, which presumably relies on unified virtual addressing - confirm
    // for the target platform.
    testkernel<<<nblock, nTPB, 0, streamk>>>(dBuf1, dBuf2, bufrdy1Flag,
                                             bufrdy2Flag, DSIZE, ITERS);
    cudaCheckErrors("kernel launch fail");

    volatile int *bufrdy;
    int *hbuf, *dbuf;
    for (int i = 0; i < ITERS; i++) {
        if (i % 2) { // ping pong on the host side
            bufrdy = bufrdy2Flag;
            hbuf = hBuf2;
            dbuf = dBuf2;
        } else {
            bufrdy = bufrdy1Flag;
            hbuf = hBuf1;
            dbuf = dBuf1;
        }

        // Wait for buffer to be full. This spins forever if the flag never
        // becomes 2; for a failsafe, count the spins and bail out past a limit,
        // e.g. if (++qq > 1000000) {printf("bufrdy = %d\n", *bufrdy); return 0;}
        while ((*bufrdy) != 2) {
        }

        cudaMemcpyAsync(hbuf, dbuf, DSIZE * sizeof(int), cudaMemcpyDeviceToHost, streamc);
        cudaStreamSynchronize(streamc);
        cudaCheckErrors("cudaMemcpyAsync fail");

        *bufrdy = 0; // release buffer back to device

        if (!validate(hbuf, DSIZE, i)) {
            printf("validation failure at iter %d\n", i);
            exit(1);
        }
    }
    printf("Completed %d iterations successfully\n", ITERS);

    // Drain the kernel stream so that an execution error is reported instead of
    // being lost at process exit.
    // NOTE(review): assumes testkernel returns after ITERS iterations - confirm.
    cudaStreamSynchronize(streamk);
    cudaCheckErrors("kernel execution fail");

    // Release everything that was acquired above.
    cudaStreamDestroy(streamk);
    cudaStreamDestroy(streamc);
    cudaFree(dBuf1);
    cudaFree(dBuf2);
    cudaFreeHost(hBuf1);
    cudaFreeHost(hBuf2);
    cudaFreeHost(const_cast<int *>(bufrdy1Flag));
    cudaFreeHost(const_cast<int *>(bufrdy2Flag));
    cudaCheckErrors("cleanup fail");

    return 0;
}
The weird thing is that if I comment out that first line, `test t(10);`, the whole thing works perfectly. The `test` class is trivial; I include it here for completeness:
test.h
#ifndef __TEST_H_
#define __TEST_H_
#include <array>
class test { // Holds sampling-range constants as per-object members; the constructor and init() are only declared here.
public:
test(int a); // Constructs the object from an int; `a` is the value later handed to init().
private:
// Sampling the radius estimates - velocities are sampled off an exponential distribution 2^-(x-t)-5
const double expBase = 2; // presumably the base "2" in the expression above - TODO confirm
const double expVerticalShift = 5; // presumably the "-5" vertical shift in the expression above - TODO confirm
/** The maximum velocity estimate that can be sampled */
const double maxSampleValue = 8000;
/** The minimum velocity estimate that can be sampled */
const double minSampleValue = 0.1;
const double minSampleX = 0; // NOTE(review): meaning of the "X" range is not shown in this header
const double maxAccelerationSample = 500000; // NOTE(review): units are not stated in this header
const double timeShift=0; // presumably the "t" in the expression above - TODO confirm
const double maxSampleX=0;
// NOTE(review): every member above is a non-static data member, so adding or removing one changes sizeof(test); all files including this header must then be rebuilt together.
//static const int numberOfRadiusEstimates = 50;
//std::array<double, numberOfRadiusEstimates> radiusLUT;
// Sampling the angle estimates
//static const int numberOfAngleEstimates = 8;
//std::array<std::array<double, 3>, numberOfAngleEstimates> angleLUT;
//Sampling the acceleration estimates
//static const int numberOfAccelerationEstimates = 1;
//std::array<double, numberOfAccelerationEstimates> accelerationLUT;
void init(int a); // Private helper taking the same int as the constructor.
};
#endif
test.cpp
#include "test.h"
#include <stdio.h>
// Constructor: writes a one-line banner to stdout, then forwards `a` to init().
test::test(int a)
{
    fputs("generating class...\n", stdout);
    this->init(a);
}
// Fills a temporary array of `a` ints so that entry i holds a*i, prints every
// entry to stdout as "arr(i)=value", and then releases the array.
//
// a: number of entries to generate and print.
void test::init(int a) {
    int *arr = new int[a];
    for (int i = 0; i < a; i++) {
        arr[i] = a * i;
    }
    for (int i = 0; i < a; i++) {
        printf("arr(%d)=%d \n",i,arr[i]);
    }
    // The buffer is only needed for the printout above; without this it
    // leaked on every construction of a `test` object.
    delete[] arr;
}
Now for the really weird part: you'll notice that in test.h I've commented out a number of member variables that I never actually use. If I uncomment anything beyond that point (for example, if I uncomment line 20), it fails with the error message “Error: Failed to suspend device for CUDA device 0”. As it is now, it works fine.
I really can’t seem to figure this out. Many thanks for your help!
Timo