Global variable in CUDA

Hi all;

I’m trying to attempt to get one kernel update an initial memory location with a value (in this example shown by the kernel name “insert”) and another kernel read the updated value from the same memory location used by the kernel name “insert” on the kernel name “read”. However, when the kernel “insert” set the memory location dev_pBF_bit_table[0x7277/0x7] and dev_pBitMask[0] to 180 and 9 respectively, the kernel “read”, read this memory location back as ‘0’.

I would like and expect the kernel “read” , read the memory location above as dev_pBF_bit_table[0x7277/0x7] = 180; and dev_pBitMask[0] to 180 = 9.

Can anyone of here provide some lights on how to do that ?

Below here is my code snippet :

__global__void insert(unsigned int *dev_pBitMask, unsigned int *dev_pBF_bit_table) {

	dev_pBF_bit_table[0x7277/0x7] = 180;
	dev_pBitMask[0] = 9; 

}

__global__void read(unsigned int *dev_pBitMask, unsigned int *dev_pBF_bit_table) {

	printf("DEBUG GPU :  0x%x, 0%x\n", dev_pBF_bit_table[0x7277/0x7],dev_pBitMask[0]);

}

int main()
{
// Instantiate a pointer for CUDA version of Bloom filter

unsigned int *dev_pBitMask = new unsigned int[8];
unsigned int *dev_pBF_bit_table = new unsigned int[8192];

for (int i = 0; i < 8; i++) {
   dev_pBitMask[i] = bit_mask[i];
}

for (int i = 0; i < filter2.BF_raw_table_size_; i++) {
	dev_pBF_bit_table[i] = filter2.BF_bit_table_[i]; 
}

		:
		:
		:
		:

	 const int dev_pBitMask_sizeof = (8)*sizeof(unsigned int); 
	 const int dev_pBF_bit_table_sizeof = (8192)*sizeof(unsigned int); 
 
	 cudaMalloc((void**)&dev_pBitMask, dev_pBitMask_sizeof); 
	 cudaMalloc((void**)&dev_pBF_bit_table,dev_pBF_bit_table_sizeof); 
	

	insert <<< 1, 8>>>(dev_pBitMask , dev_pBF_bit_table);

	match <<<1, 8>>> (dev_pBitMask , dev_pBF_bit_table);


	:
	:
	:

return 0; 

}

Especially when learning CUDA, or any time you are having trouble, you should always do proper cuda error checking by inspecting the return values of all cuda API calls. Most cuda sample codes have exampels of how to do error checking.

calling cudaMalloc on a host variable that you have already defined, allocated, and initialized on the host, does not create a device-accessible version with that initialization. You may want to review how to write a basic CUDA program including movement of data from host code to device code.

Your host code should look something like this:

int main()
{
// Instantiate a pointer for CUDA version of Bloom filter

unsigned int *host_pBitMask = new unsigned int[8];
unsigned int *host_pBF_bit_table = new unsigned int[8192];

for (int i = 0; i < 8; i++) {
host_pBitMask[i] = bit_mask[i];
}

for (int i = 0; i < filter2.BF_raw_table_size_; i++) {
host_pBF_bit_table[i] = filter2.BF_bit_table_[i];
}

const int dev_pBitMask_sizeof = (8)*sizeof(unsigned int);
const int dev_pBF_bit_table_sizeof = (8192)*sizeof(unsigned int);

unsigned int *dev_pBitMask;
unsigned int *dev_pBF_bit_table;

cudaMalloc((void**)&dev_pBitMask, dev_pBitMask_sizeof);
cudaMalloc((void**)&dev_pBF_bit_table,dev_pBF_bit_table_sizeof);

cudaMemcpy(dev_pBitMask, host_pBitMask, dev_pBitMask_sizeof, cudaMemcpyHostToDevice);
cudaMemcpy(dev_pBF_bit_table, host_pBF_bit_table, dev_pBF_bit_table_sizeof, cudaMemcpyHostToDevice);

insert <<< 1, 8>>>(dev_pBitMask , dev_pBF_bit_table);

match <<<1, 8>>> (dev_pBitMask , dev_pBF_bit_table);

cudaDeviceSynchronize();

return 0;