I am getting some strange behavior when trying to read global memory. My application performs operations on elements in a large array, which is initialized to zero. The device code sets the first element of the data array to a value, then all subsequent data elements are copied from that element within the data elements array. I don’t expect all the values to be set, there are multiple cores running simultaneously so until the first block of threads complete the initial element will still be zero. The problem is that it’s accuracy is not that great, with a different number of errors each run.
[codebox]#include <stdio.h>
#include <cuda_runtime.h>
#define mesh 512
const dim3 dimBlock(512);
const dim3 dimGrid(mesh/dimBlock.x, mesh/dimBlock.y, 1);
const int Num_Elements = mesh*mesh;
static long *data_elements_h, *data_elements_d = NULL;
// Device Function
global void Process_Elements(long* data_elements_d)
{
int ThreadID = (blockIdx.y*gridDim.x+blockIdx.x)*blockDim.y*blockDim.x+thre
adIdx.y*blockDim.x+threadIdx.x;
if (ThreadID==0) data_elements_d[ThreadID] = 0xF;
else data_elements_d[ThreadID] = data_elements_d[0]; // Suspect read/write
__syncthreads();
}
// Host Function
#if DEVICE_EMULATION
bool InitCudaDevice(void) { return true; }
bool InitCudaDevice(void) { cudaSetDevice(0); return true; }
int main(int argc, char* argv)
{
long input_h = 0xF;
int elements_error = 0;
if (!InitCudaDevice()) return 0;
size_t Data_size = Num_Elements*sizeof(long);
data_elements_h = (long *)malloc(Data_size);
data_elements_h[0] = input_h; // Set first element to the input value
for (int ThreadID=1; ThreadID<Num_Elements; ThreadID++) data_elements_h[ThreadID] = 0; // Init other elements to zero
cudaMalloc((void **) &data_elements_d, Data_size);
cudaMemcpy(data_elements_d, data_elements_h, Data_size, cudaMemcpyHostToDevice);
Process_Elements<<<dimGrid, dimBlock, 0, 0>>>(data_elements_d);
cudaMemcpy(data_elements_h, data_elements_d, Num_Elements*sizeof(long), cudaMemcpyDeviceToHost);
for (int k=0; k<Num_Elements; k++) {
if (data_elements_h[k] != input_h) {
elements_error++;
//printf("data_elements_h(%d): %llX\n", k, data_elements_h[k]);
}
}
if (elements_error) printf("%d OK, %d Errors out of %d\n", Num_Elements-elements_error, elements_error, Num_Elements); else printf("No Element Errors\n");
free(data_elements_h);
cudaFree(data_elements_d);
cudaThreadExit();
return 0;
}[/codebox]
If I initialize the data with the first element already set then I get no errors and every data element gets the value 0xF after one iteration. However I need the code to run on the device without having to copy all the memory back and forth.
How do I determine how what span to leave before reading a value written by a previous thread? I’d like to be able to reference data based off the ThreadID like so:
data_elements_d[ThreadID] = data_elements_d[ThreadID - padding];
Thanks for any pointers,
John