I have the following condition:
a struct Lock (with constructor and destructor) pass as parameter to kernel1 after finish kernel1 seems that kernel1 call the destructor in exit of kernel1 why this happens if it has been declared out of scope? continuing…
when kernel2 is invoked the values are displayed on screen but when I execute with cuda-memcheck gives a lot error due that kernel1 call automatically the destructor of Lock. How can prevent this?
//nvcc -rdc=true -arch=sm_35 -o t605 t605.cu
//sudo optirun --no-xorg ./t605
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
struct Lock{
int *indexAllow;
__host__ Lock(void){
int startVal = 0;
cudaMalloc((void**) &indexAllow, sizeof(int));
cudaMemcpy(indexAllow, &startVal, sizeof(int), cudaMemcpyHostToDevice);
}
__host__ ~Lock(void){
cudaFree(indexAllow);
}
__device__ void lock(){
while( atomicCAS(indexAllow, 0, 1) != 0 ) ;
}
__device__ void unlock(){
atomicExch( indexAllow , 0 ) ;
}
__device__ int get_indexAllow(){
return *indexAllow;
}
};
__global__ void kernel_child(int *result){
atomicAdd(result, 1);
}
__global__ void kernel(Lock mylock, int *count, int *valor){
int tid = threadIdx.x + (blockIdx.x * blockDim.x);
//~ int *mutex = new int(0);
int *contador = new int(0);
for (int i=0; i<32; i++) {
if ((tid % 32) == i) {
mylock.lock();
kernel_child<<<10,10>>>(contador);
cudaDeviceSynchronize();
*valor = *valor + *contador;
mylock.unlock();
}
if ((tid % 32) == i && threadIdx.x == 0) {
//Not displayed in order here
mylock.lock();
printf("%d %d\n", blockIdx.x, *valor);
mylock.unlock();
}
}
//~ for (int i=0; i<32; i++) {
//~ if ((tid % 32) == i) {
//~ while(atomicCAS(count, 0, 1) != 0);
//~ exec = exec + 1;
//~ *valor = *valor + 1;
//~ atomicExch(count, 0);
//~ }
//~ }
__syncthreads();
delete contador;
contador = NULL;
}
__global__ void kernel2(Lock mylock){
printf("%d %d\n", threadIdx.x, mylock.get_indexAllow());
}
int main(int argc, char **argv){
Lock mylock;
int *mutex, *c, cin=0, *cout = new int;
cudaMalloc((void**) &c, sizeof(int));
cudaMemcpy(c, &cin, sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void**) &mutex, sizeof(int));
cudaMemset(mutex, 0, sizeof(int));
/* Executing kernel 1, works fine */
kernel<<<10,100>>>(mylock, mutex, c);
cudaDeviceSynchronize();
cudaMemcpy(cout, c, sizeof(int), cudaMemcpyDeviceToHost);
printf("c: %d\n", *cout);
cudaFree(c);
delete cout;
cout = NULL;
/* Executing kernel 2, works fine but gives errors in cuda-memcheck */
kernel2<<<1,2>>>(mylock);
cudaDeviceSynchronize();
return 0;
}
Output of ./t605:
Output of cuda-memcheck ./t605: