second kernel call results in segmentation fault and other annoying problems

all that kernel 1 does is initialize some data.

why is it that when I execute a call to kernel 1 there is according to cudaGetLastError no error but when I make a second identical call to that same kernel immediately after then I get a segmentation fault? I also get similar segmentation faults for other different kernels.

why is it that when in emulation mode the kernel produces the correct initialization (non-zero) as read from inside the kernel, but when I try to memCopy that data back and print it out in host code it is all zero?

why is it that when I try to memCopy some data back from the first kernel that I get an invalid device pointer error? (this may well cause the above error)

why is it that when immediately after the first call to kernel 1 I cudaFree the memory on the device and immediately reallocate it I don’t get the segmentation fault?

I thought you could allocate memory on the device via cudaMalloc only once and then refer to it in as many subsequent kernel calls

as required. It now seems I need deallocate and the reallocate.

This is why I asked about device variables with multiple GPUs.

You can.

Are you calling cudaThreadSynchronize before cudaGetLastError() after the first kernel call?

I’m guessing your first kernel call is writing beyond allocated memory and so all bets are off concerning any amount of reproducibility or sanity when dealing with any following calls made. Valgrind can be an invaluable tool in finding these issues. Compile in emulation mode with debug symbols and run through valgrind to help identify the culprit. Also check for the passing of host pointers to the device. This will work in emulation but will fail with ULFs on the device.

Besides these general principles to debugging these problems, what do you expect us to do to help given the information you’ve presented? To really find the problem we are going to need a full and minimal sample code that demonstrates it so we can tell where you are going wrong.

Yes I am calling cudaThreadSynchronize before cudaGetLastError() .

I have attached some code below…a very simple initialization on the GPU.

What I would like to know is

  1. why is it that when I allocate the array d_type using cudaMalloc I get a segmentation fault on the second call to the kernel, but when I allocate the same array as a device in the preprocessor I do not get a segmentation fault?

  2. why is it that in both cases above I always get an invalid device pointer error when I call cudaMemcpy to copy device data back into the host array h_type?

I am compiling wth the command

nvcc -o Obstacleemu Obstacle.cu -L/home/chrism/CUDA2/lib -lcutil -L/opt/cuda/lib -lcudart -I/home/chrism/CUDA2/common/inc -deviceemu -lcuda -lglut -lpthread (I haven’t bothered to delete some of the pthread and partice setup stuff in the code)

Any suggestions will be welcome because this is driving me crayzee!

[codebox]#include <pthread.h>

#include <semaphore.h>

#include <stdio.h>

// includes, system

#include <math.h>

#include <stdlib.h>

#include <cutil.h>

#include <cutil_math.h>

#include <multithreading.h>

#include <cuda.h>

#include <assert.h>

#include <cuda_runtime.h>

#include <cuda_runtime_api.h>

#include <device_launch_parameters.h>

#define PARAVIEW

#define NGPU 1

#define NSYM 0

#define MAXITS 20 //5000 //12000 //5000 //7500

////////////////////////////////////////

//set up #2 for 128*128 real particles

////////////////////////////////////////

//real particles

#define NXREAL 128

#define NYREAL 128

#define NREAL 16384 //NXREAL*NYREAL

//virtual particles

#define NXVIRT 960 //224 //160 //num virt particles in floor, 16 per metre

#define NYVIRTL 640 //128 //num virt particles in left hand wall, 16 per metre

#define NYVIRTR1 64 //16 //num virt particles in slope

#define NYVIRTR2 640 //144 //16 //num virt particles in right hand wall #2

#define NVIRT 2304 //512 //384 //320 //NXVIRT + NYVIRTL + NYVIRTR

//total particles

#define NTOTAL 18688 //1408 //1344 //NREAL + NVIRT //=NUMBLOCKS*NUMTHREADS

//grid definition

#define NUMBLOCKS 292 //88 //84

#define NUMTHREADS 64 //64 //NUMBLOCKS*NUMTHREADS = NTOTAL

#define NUMREALBLOCKS 256

#define NUMVIRTBLOCKS 36

#define XL 16

#define YL 16

#define DXL 0.125

#define DYL 0.125

#define DIM 2

#define XSLOPE 20.0

#define XRWALL 60.0

#define GRAVITY 9.81

#define PI 3.141592653

#define EPSILON 0.5

#define R0 0.1

#define NUMSTRIDES 1

#define STRIDEBLOCKS 292 //=NUMBLOCKS/NUMSTRIDES //146 for NUMSTRIDES=2, 73 for NUMSTRIDES=4

#define GPUBLOCK1 NTOTAL/NGPU

#define GPUBLOCK2 NREAL/NGPU

////////////////////////////////////////

//END set up #2 for 128*128 real particles

////////////////////////////////////////

device int type[NTOTAL];

//global void input1(int stride, int gputhread,int* itype);

global void input1(int stride, int gputhread);

//output

FILE* outfile;

main( int argc, char *argv )

{

outfile = fopen("gpuresult.txt","w");

//CUdeviceptr d_itype;



int	*h_type;

int	*d_type;

//cudaError_t	error;

int	device = 1;

int	gputhread = 0;

int	threadstride = gputhread*GPUBLOCK1;

fprintf(outfile,"\nthreadstride = %i",threadstride);

cudaDeviceProp deviceProp;

   CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, device));

printf("\nGPU Thread %d uses device %d: “%s”\n", gputhread, device, deviceProp.name);

//Set device

	CUDA_SAFE_CALL( cudaSetDevice(device) );

printf("\n\ncudaSetDevice %s\n", cudaGetErrorString(cudaGetLastError()));

CUT_SAFE_MALLOC(h_type = (int *)malloc(NTOTAL * sizeof(int)) );

//CUDA_SAFE_CALL( cudaMalloc((void**)&d_type,NTOTAL*sizeof(int)) );

printf("\n\ncudaMalloc %s\n", cudaGetErrorString(cudaGetLastError()));

//cuMemAlloc(&d_itype, NTOTAL * sizeof(int));

//printf("\n\ncuMemAlloc %s\n", cudaGetErrorString(cudaGetLastError()));

//input1<<<NUMBLOCKS/NGPU, NUMTHREADS>>>(GPUBLOCK1,gputhread,d_type);

input1<<<NUMBLOCKS/NGPU, NUMTHREADS>>>(GPUBLOCK1,gputhread);

cudaThreadExit();

cudaThreadSynchronize();

printf("\n\ninput1 %s\n", cudaGetErrorString(cudaGetLastError()));	

	CUT_CHECK_ERROR("Input1() execution failed.\n");

CUDA_SAFE_CALL( cudaMemcpy(h_type, &d_type, GPUBLOCK1 * sizeof(int), cudaMemcpyDeviceToHost) );

printf("\n\ncudaMemcpy %s\n", cudaGetErrorString(cudaGetLastError()));

//CUDA_SAFE_CALL( cudaFree(d_type) );

//CUDA_SAFE_CALL( cudaMalloc((void**)&d_type,NTOTAL*sizeof(int)) );

//input1<<<NUMBLOCKS/NGPU, NUMTHREADS>>>(GPUBLOCK1,gputhread,d_type);

input1<<<NUMBLOCKS/NGPU, NUMTHREADS>>>(GPUBLOCK1,gputhread);

fprintf(outfile,"\nend input");

cudaThreadExit();

cudaThreadSynchronize();

printf("\n\ninput1 %s\n", cudaGetErrorString(cudaGetLastError()));	

	CUT_CHECK_ERROR("Input1() execution failed.\n");

CUDA_SAFE_CALL( cudaMemcpy(h_type, &d_type, GPUBLOCK1 * sizeof(int), cudaMemcpyDeviceToHost) );

printf("\n\ncudaMemcpy %s\n", cudaGetErrorString(cudaGetLastError()));	



//cuMemFree(d_itype);

free(h_type);

//CUDA_SAFE_CALL( cudaFree(d_type) );

return 0;

} /* end main */

//////////////////////////////////////

//////////////////////////////////////

//////////////////////////////////////

//////////////////////////////////////

//////////////////////////////////////

//global void input1(int stride, int gputhread,int* type)

global void input1(int stride, int gputhread)

{

int	locali,globali;

locali = blockDim.x * blockIdx.x + threadIdx.x;

globali = gputhread*stride + locali;	



if(globali<NREAL) 

{

	

    	type[locali] = 2;

}

else

{	

  	type[locali] = -2;

}

}

[/codebox]

You are calling cudaThreadExit() after this first kernel call. This shuts down the context and everything you had in it on the GPU, including freeing any allocated memory. So you are getting expected behavior.

You never need to call cudaThreadExit() yourself. The runtime will automatically call it when the current host thread terminates.

OK, but that does not explain why I get invalid device pointer errors when I try to cuMemcpy back from the device.

I do not get invalid device pointer errors if

  1. I declare a variable X on the device as device X in the preprocessor

  2. I declare another device variable d_X in main and pass that to the device via a kernel which copies device X into d_X

  3. I then cuMemcpy d_X into a host variable host_X and print host_X.

Dynamically allocated memory is freed on cudaThreadExit(). Attempting to use a freed pointer results in undefined behavior.

The device X variables might reside in the same location when the context is shut down and restarted, but it is unsafe to depend on this behavior. It’s certainly unsafe to depend on them retaining their values.