I’m newbee in CUDA and am trying to convert my ray tracer into CUDA version, but I can’t stop getting error “unspecified launch failure” on
gpuErrchk( cudaMemcpy(buffer, dev_buffer, sizeof(CudaVec)*w*h, cudaMemcpyDeviceToHost));
. This sentence follows the kernel and I googled a bit it is said to be something related to memory operations inside the kernel. Then, I deleted most of my code and what’s left is:
The host part:
void CudaInit(CudaGeometry* geos, int _n, double* h, double* randNum, int _randN)
{
// cudaOutput = fopen("cudaoutput.txt", "w");
hits=h;
printf("cuda part init %d\n", _n);
fflush(stdout);
if(!geos)
{
return;
}
if(!dev_geos)
{
gpuErrchk(cudaFree(dev_geos));
gpuErrchk(cudaFree(dev_hits));
gpuErrchk(cudaFree(dev_randNum))
}
n = _n;
randN = _randN;
printf("CUDA: sizeof(CudaGeometry)=%d, sizeof(CudaVec)=%d, n=%d\n", sizeof(CudaGeometry), sizeof(CudaVec), n);
gpuErrchk( cudaMalloc((void**)&dev_geos, sizeof(CudaGeometry)*n));
printf("cudaMalloc((void**)&dev_geos, sizeof(CudaGeometry)*n)\n");
gpuErrchk( cudaMalloc((void**)&dev_hits, sizeof(double)*n));
printf("cudaMalloc((void**)&dev_hits, sizeof(double)*n)\n");
gpuErrchk( cudaMalloc((void**)&dev_randNum, sizeof(double)*randN));
printf("cudaMalloc((void**)&dev_randNum, sizeof(double)*randN)\n");
gpuErrchk( cudaMemcpy(dev_geos, geos, sizeof(CudaGeometry)*n, cudaMemcpyHostToDevice));
printf("cudaMemcpy(dev_geos, geos, sizeof(CudaGeometry)*n, cudaMemcpyHostToDevice)\n");
gpuErrchk( cudaMemcpy(dev_randNum, randNum, sizeof(double)*randN, cudaMemcpyHostToDevice));
printf("cudaMemcpy(dev_randNum, randNum, sizeof(double)*randN, cudaMemcpyHostToDevice)\n");
gpuErrchk( cudaPeekAtLastError() );
fflush(stdout);
}
void CudaRender(int w, int h, CudaVec camera, CudaVec up, CudaVec forward, CudaVec right, CudaVec* buffer)
{
CudaVec* dev_buffer;
memset(buffer, 0, sizeof(CudaVec)*w*h);
gpuErrchk( cudaMalloc((void**)&dev_buffer, sizeof(CudaVec)*w*h));
printf("start cuda render\n");
printf("buffer.size=%d %d\n", sizeof(buffer), sizeof(buffer)/sizeof(CudaVec));
fflush(stdout);
CudaMonteCarloRender<<<dim3(w, h), 1>>>(dev_geos, n, w, h, camera, up, forward, right, dev_buffer, dev_randNum, randN);
fflush(stdout);
printf("end cuda render\n");
fflush(stdout);
gpuErrchk( cudaMemcpy(buffer, dev_buffer, sizeof(CudaVec)*w*h, cudaMemcpyDeviceToHost));
gpuErrchk( cudaFree(dev_buffer));
}
My kernel:
// empty function
__device__ void CudaMonteCarloSample(CudaGeometry* geolist, int n, CudaVertex o, CudaRay i, double* randNum, int randN, CudaVec& result)
{
// result = (i.n.Vec3()+CudaVec(1.0, 1.0, 1.0))/2;
return;
}
__global__ void CudaMonteCarloRender(CudaGeometry* geolist, int n, int w, int h, CudaVec camera, CudaVec up, CudaVec forward, CudaVec right, CudaVec* buffer, double* randNum, int randN)
{
if(n==0) return;
int xx=blockIdx.x;
int yy=blockIdx.y;
if(xx<0 || xx>=w || yy<0 || yy>=h) return;
int index = xx+yy*gridDim.x;
CudaRay ray = GetRay(xx, yy, w, h, camera, up, forward, right); // generate a ray from camera
buffer[index] = CudaVec(0, 0, 0);
for(int sp=0; sp<SampleNum; sp++)
{
double mind=1e20;
CudaVertex minp(false);
// this checks the result of GetRay
buffer[index] = buffer[index]+(ray.n.Vec3()+CudaVec(1.0, 1.0, 1.0))/2;
for(int gi=0; gi<n; gi++)
{
CudaVertex hp(false);
CudaGeometry geo = geolist[gi];
ray.IntersectGeo(geo, hp);
if(hp.valid==true)
{
double d=(hp.p.Vec3()-camera).Length();
if(d>1e-3 && d<mind)
{
mind=d;
minp=hp;
}
}
}
if(minp.valid)
{
CudaVec result = buffer[index];
CudaMonteCarloSample(geolist, n, minp, ray, randNum, randN, result);
buffer[index] = CudaVec(0.0, 0.0, 0.0);
}
}
buffer[index] = buffer[index]/SampleNum;
return;
}
I can’t figure out what’s wrong with this code. And what’s most wired is that if I delete
buffer[index] = CudaVec(0.0, 0.0, 0.0);
on line 48 in the kernel, there’s no error.
Thank you all!
montecarlosample.cu (15.5 KB)