Free memory after a kernel finishes

I have the following situation:
A struct Lock (with a constructor and a destructor) is passed by value as a parameter to kernel1. After kernel1 finishes, the destructor of the Lock seems to be called. Why does this happen, when the object was declared outside that scope? Continuing on:
When kernel2 is invoked the values are displayed on screen, but when I run it under cuda-memcheck I get a lot of errors, because kernel1 automatically triggered the destructor of the Lock. How can I prevent this?

//nvcc -rdc=true -arch=sm_35 -o t605 t605.cu
//sudo optirun --no-xorg ./t605

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>

struct Lock{
	int *indexAllow;
	__host__ Lock(void){
		int startVal = 0;
		cudaMalloc((void**) &indexAllow, sizeof(int));
		cudaMemcpy(indexAllow, &startVal, sizeof(int), cudaMemcpyHostToDevice);
	}
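	// Note: this destructor runs for every copy of the Lock that gets destroyed,
	// including the by-value copy made when the struct is passed to a kernel.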
	__host__ ~Lock(void){
		cudaFree(indexAllow);
	}
	__device__ void lock(){
		while( atomicCAS(indexAllow, 0, 1) != 0 ) ;
	}
	__device__ void unlock(){
		atomicExch( indexAllow , 0 ) ;
	}
	__device__ int get_indexAllow(){
		return *indexAllow;
	}
};

__global__ void kernel_child(int *result){
	atomicAdd(result, 1);
}

__global__ void kernel(Lock mylock, int *count, int *valor){
	int tid = threadIdx.x + (blockIdx.x * blockDim.x);	
	//~ int *mutex = new int(0);
	int *contador = new int(0);
	
	for (int i=0; i<32; i++) {
		if ((tid % 32) == i) {
			mylock.lock();
			kernel_child<<<10,10>>>(contador);
			cudaDeviceSynchronize();
			*valor = *valor + *contador;
			mylock.unlock();
		}
		if ((tid % 32) == i && threadIdx.x == 0) {
			//Not displayed in order here
			mylock.lock();
			printf("%d %d\n", blockIdx.x, *valor);
			mylock.unlock();
		}
	}
	
	//~ for (int i=0; i<32; i++) {
		//~ if ((tid % 32) == i) {
			//~ while(atomicCAS(count, 0, 1) != 0);
			//~ exec = exec + 1;
			//~ *valor = *valor + 1;
			//~ atomicExch(count, 0);
		//~ }
	//~ }

	__syncthreads();
	delete contador;
	contador = NULL;
}

__global__ void kernel2(Lock mylock){
	printf("%d %d\n", threadIdx.x, mylock.get_indexAllow());
}

int main(int argc, char **argv){

	Lock mylock;
	int *mutex, *c, cin=0, *cout = new int;
	
	cudaMalloc((void**) &c, sizeof(int));
	cudaMemcpy(c, &cin, sizeof(int), cudaMemcpyHostToDevice);
	cudaMalloc((void**) &mutex, sizeof(int));
	cudaMemset(mutex, 0, sizeof(int));
	
	/* Executing kernel 1, works fine */
	kernel<<<10,100>>>(mylock, mutex, c);
	cudaDeviceSynchronize();
	cudaMemcpy(cout, c, sizeof(int), cudaMemcpyDeviceToHost);
	printf("c: %d\n", *cout);
	cudaFree(c);
	delete cout;
	cout = NULL;
	
	/* Executing kernel 2, works fine but gives errors in cuda-memcheck */
	kernel2<<<1,2>>>(mylock);
	cudaDeviceSynchronize();
	
	return 0;
}

Output of ./t605:

Output of cuda-memcheck ./t605:

One possible idea:

Don’t pass the Lock struct by value as a kernel parameter. I think this pattern originated in CUDA by Example a long time ago, but there’s no particular reason it has to be this way, and the constructor/destructor calling sequence for objects passed by value as kernel parameters is, IMO, somewhat confusing anyway.

Separate the methods from the data. Pass the lock data as kernel parameters. Let the methods be ordinary function calls that take the lock data to operate on as a parameter.
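For illustration, here is a minimal sketch of that restructuring (the names lock_acquire, lock_release, d_lock and d_valor are mine, not from the thread): the lock is just an int allocated with cudaMalloc, and the device functions receive a pointer to it.

#include <stdio.h>

// Device-side lock helpers operating on a plain int in global memory.
__device__ void lock_acquire(int *lockVar){
	while (atomicCAS(lockVar, 0, 1) != 0) ;
}

__device__ void lock_release(int *lockVar){
	__threadfence();	// make critical-section writes visible before releasing
	atomicExch(lockVar, 0);
}

__global__ void kernel(int *lockVar, int *valor){
	// Only one thread per block competes for the lock, which sidesteps the
	// intra-warp contention that the tid % 32 loop in the question works around.
	if (threadIdx.x == 0){
		lock_acquire(lockVar);
		*valor = *valor + 1;	// critical section
		lock_release(lockVar);
	}
}

__global__ void kernel2(int *lockVar){
	// lockVar is still a valid device allocation here: nothing has freed it,
	// because no object with a destructor was ever passed by value.
	printf("%d %d\n", threadIdx.x, *lockVar);
}

int main(void){
	int *d_lock, *d_valor, result = 0;

	cudaMalloc((void**) &d_lock, sizeof(int));
	cudaMemset(d_lock, 0, sizeof(int));
	cudaMalloc((void**) &d_valor, sizeof(int));
	cudaMemset(d_valor, 0, sizeof(int));

	kernel<<<10,100>>>(d_lock, d_valor);
	cudaDeviceSynchronize();
	cudaMemcpy(&result, d_valor, sizeof(int), cudaMemcpyDeviceToHost);
	printf("valor: %d\n", result);	// expect 10: one increment per block

	kernel2<<<1,2>>>(d_lock);
	cudaDeviceSynchronize();

	cudaFree(d_lock);
	cudaFree(d_valor);
	return 0;
}

Because the lock variable is an ordinary device allocation with no constructor or destructor attached to it, it stays valid until you cudaFree it, so a later kernel can still use it and cuda-memcheck should no longer report the premature free.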

So not pass-by-value, but pass-by-reference instead?

Something like this?

__device__ void lock(int *indexToLock); //where indexToLock is a global variable?

No, kernel parameters cannot be passed by reference. It’s illegal (except in a Unified Memory setting).
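(As an aside, one hedged way to keep the struct and still avoid the by-value copy is to pass a pointer to a Lock that lives in managed memory. The placement-new and explicit destructor calls below are my own illustration, assuming the Lock struct exactly as defined in the question and a Unified-Memory-capable setup, which CUDA 6+ with the sm_35 target used above satisfies.)

#include <stdio.h>
#include <new>	// placement new
// assumes the Lock struct exactly as defined in the question above

__global__ void kernel2(Lock *mylock){	// takes a pointer, not a by-value copy
	printf("%d %d\n", threadIdx.x, mylock->get_indexAllow());
}

int main(void){
	Lock *mylock;
	cudaMallocManaged((void**) &mylock, sizeof(Lock));
	new (mylock) Lock();		// constructor runs once, on the managed allocation

	kernel2<<<1,2>>>(mylock);	// only the pointer is copied at launch
	cudaDeviceSynchronize();

	mylock->~Lock();		// the destructor runs only when we call it explicitly
	cudaFree(mylock);
	return 0;
}

Here nothing frees indexAllow behind your back; the cudaFree inside the destructor happens only at the explicit call near the end.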

Yes, something like that. There are plenty of examples on the internet if you care to search for them.

Let’s be clear that using locks like this is troublesome. I’m not saying it’s a great idea. I’m suggesting a method to work around your constructor/destructor/kernel-parameter issue.

Thanks txbob, it works fine without the struct.