kernel FIFO queue length

If I schedule a lot of kernels, it appears that on my machine the first 1023 launches return immediately, while subsequent launches block, presumably so that the kernel FIFO queue stays under 1024 entries. Can this limit be changed? I couldn't find anything about it in the documentation.

I’m attaching an example that demonstrates this behavior (derived from another poster’s code).

#include <iostream>
#include <cstdlib>   // std::atol, rand
#include <ctime>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

// Brute-force nearest neighbour: each thread scans all points to find
// the one closest to its own point.
__global__ void FindClosestGPU(float3* points, size_t* indices, size_t count)
{
    if(count <= 1) return;

    size_t id = threadIdx.x + blockIdx.x * blockDim.x;
    if(id < count)
    {
        float3 thisPoint = points[id];
        float smallestSoFar = 3.40282e38f;   // ~FLT_MAX

        for(size_t i = 0; i < count; i++)
        {
            if(i == id) continue;

            // Squared Euclidean distance (no sqrt needed for comparison)
            float dist = (thisPoint.x - points[i].x) * (thisPoint.x - points[i].x);
            dist += (thisPoint.y - points[i].y) * (thisPoint.y - points[i].y);
            dist += (thisPoint.z - points[i].z) * (thisPoint.z - points[i].z);

            if(dist < smallestSoFar)
            {
                smallestSoFar = dist;
                indices[id] = i;
            }
        }
    }
}


int main(int argc, char** argv)
{
    size_t num_launches = 1040;
    if(argc >= 2)
        num_launches = std::atol(argv[1]);

    const size_t count = 10000;

    // Host arrays of points and results
    size_t *indexOfClosest = (size_t *)malloc(count * sizeof(size_t));
    float3 *points = (float3 *)malloc(count * sizeof(float3));

    float3 *d_points;
    size_t *d_indexOfClosest;

    cudaMalloc(&d_points, count * sizeof(float3));
    cudaMalloc(&d_indexOfClosest, count * sizeof(size_t));

    // Create a list of random points
    for(size_t i = 0; i < count; i++)
    {
        points[i].x = (float)((rand() % 10000) - 5000);
        points[i].y = (float)((rand() % 10000) - 5000);
        points[i].z = (float)((rand() % 10000) - 5000);
    }

    cudaMemcpy(d_points, points, sizeof(float3) * count, cudaMemcpyHostToDevice);

    // Time each launch call itself: while the queue has room, the call
    // returns almost immediately; once the queue is full, it blocks.
    for(size_t j = 0; j < num_launches; ++j)
    {
        std::clock_t startTime = std::clock();
        FindClosestGPU<<<(count / 1024) + 1, 1024>>>(d_points, d_indexOfClosest, count);
        std::clock_t finishTime = std::clock();

        std::cout << j << " " << double(finishTime - startTime) / CLOCKS_PER_SEC * 1000 << " ms." << std::endl;
    }

    // This copy synchronizes with all outstanding kernels
    cudaMemcpy(indexOfClosest, d_indexOfClosest, sizeof(size_t) * count, cudaMemcpyDeviceToHost);

    std::cout << "DONE" << std::endl;

    // Print the first few results to screen
    std::cout << "Final results:" << std::endl;
    for(size_t i = 0; i < 10; i++)
        std::cout << i << ". " << indexOfClosest[i] << std::endl;

    // Deallocate host and device memory
    free(points);
    free(indexOfClosest);

    cudaFree(d_indexOfClosest);
    cudaFree(d_points);

    cudaDeviceReset();

    return 0;
}

If you cannot adjust the queue (and I doubt you can), an option would be to 'monitor' it, likely via an event, to know when it is safe to issue again:

issue kernel batch
record event
issue kernel batch

while not done
{
    [do something else in the meantime]
    wait for event
    record event
    issue kernel batch
}
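
Here is a minimal sketch of that scheme, written against the example above. The queue depth of 1024 is just the value observed on your machine, not a documented limit, and the names QUEUE_DEPTH, BATCH, launch_batch, and num_batches are mine, for illustration only:

// Assumed queue depth (observed, not documented) and a batch of half that.
const size_t QUEUE_DEPTH = 1024;
const size_t BATCH = QUEUE_DEPTH / 2;

// Issue one batch of launches; reuses the kernel and buffers from above.
void launch_batch(float3* d_points, size_t* d_indexOfClosest, size_t count)
{
    for(size_t i = 0; i < BATCH; ++i)
        FindClosestGPU<<<(count / 1024) + 1, 1024>>>(d_points, d_indexOfClosest, count);
}

// In main(), in place of the plain launch loop:
cudaEvent_t ev;
cudaEventCreate(&ev);

launch_batch(d_points, d_indexOfClosest, count);   // batch 1
cudaEventRecord(ev);                               // completes when batch 1 has drained
launch_batch(d_points, d_indexOfClosest, count);   // batch 2, issued one in advance

for(size_t b = 2; b < num_batches; ++b)
{
    // [do something else in the meantime]
    cudaEventSynchronize(ev);   // the older batch has left the queue
    cudaEventRecord(ev);        // now marks the end of the batch still in flight
    launch_batch(d_points, d_indexOfClosest, count);
}

cudaEventSynchronize(ev);
cudaEventDestroy(ev);

With these sizes the queue never holds more than two batches at once, so the launch calls themselves should not block.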

Thanks

I should probably have mentioned that in the example, a kernel batch is half the known queue depth, and the host always has one batch issued in advance, so that the device cannot 'catch up with' the host: when the host wakes from the event wait, at most one batch is still queued, and issuing the next batch brings the total back up to at most the full queue depth.