Host and device spinning on the same flag

Hi, I’m new to CUDA programming, and I’m currently implementing a simple program that achieves the following functionality:

  1. When the device needs to read data from the file system, it sends a ‘request’ to host asking for data transfer.
  2. Host reads data from the file system and copies them to device memory.
  3. Host notifies device that the data transfer has finished.

In order to implement this, I’m following the following step:

  1. The host and device spins on a ready flag. The ready flag is pinned memory so that host and device can both have access to.
  2. When the device requests data transfer, it sets the ready flag to true. Since the host is spinning on the ready flag, the host can then read from the file system and transfer the data to device memory.
  3. The host sets the ready flag to false after finishing the data transfer. Then GPU knows that it can read its local memory.

Say that the device kernel requests data transfer for 2000 times. Then I wrote my CPU and GPU kernels as below:

  
__global__ void GPU_KERNEL(int* dev_spin, char* dev_mem)
{
        int counter = 0;
        while (counter < 2000)
        {
                if (*dev_spin == 1)
                {
                        counter++;
                        *dev_spin = 0;
                }
        }
}

int main()
{
        /* Some initialization */

        /* CPU kernel */
        int counter = 0;
        while (1)
        {
                if (*host_spin == 0)
                {
                        fread(host_mem, 4*1024, 1, f);
                        HANDLE_ERROR(cudaMemcpyAsync(dev_mem, host_mem, 4*1024, cudaMemcpyHostToDevice, 0));
                        counter++;
                        *host_spin = 1;
                }
                if (cudaStreamQuery(0) == cudaSuccess) break;
        }
}

  

When I run this program, it turns out that GPU can only request data transfer for 965 times, and more requests would cause the GPU kernel to run into an infinite loop. I’m wondering if this is due to race conditions between host and device or a hardware limitation of GPU, since the data transfer is always carried out 965 times whenever I run the program (on different machines). I’ve been stuck with this problem for several days, and I would appreciate any help or thoughts on this issue.

Below is the full version of my code:

  
__global__ void GPU_KERNEL(int* dev_spin, char* dev_mem)
{
        int counter = 0;
        while (counter < 2000)
        {
                if (*dev_spin == 1)
                {
                        counter++;
                        *dev_spin = 0;
                }
        }
}

int main()
{
        /* a shared memory location that CPU and GPU can access */
        int* host_spin = (int*)calloc(1, sizeof(int));

        /* pin the shared memory */
        HANDLE_ERROR(cudaHostRegister(host_spin, sizeof(int), cudaHostRegisterMapped));

        /* Zero copy the shared memory */
        int* dev_spin;
        HANDLE_ERROR(cudaHostGetDevicePointer(&dev_spin, host_spin, 0));

        /* Initialize the shared memory */
        *host_spin = 1;

        /* allocate the CPU and GPU memory for data copying */
        char* host_mem = (char*)calloc(4*1024,1);
        char* dev_mem;
        HANDLE_ERROR(cudaMalloc((void**)&dev_mem, 4*1024));

        /* Open the file */
        FILE* f = fopen("file.txt", "r");

        /* Create the GPU kernel */
        GPU_KERNEL <<<>>> (dev_spin, dev_mem);


        int counter = 0;
        while (1)
        {
                if (*host_spin == 0)
                {
                        fread(host_mem, 4*1024, 1, f);
                        HANDLE_ERROR(cudaMemcpyAsync(dev_mem, host_mem, 4*1024, cudaMemcpyHostToDevice, 0));
                        counter++;
                        *host_spin = 1;
                }
                if (cudaStreamQuery(0) == cudaSuccess) break;
        }
        cudaHostUnregister(host_spin);
        return 0;
}