If I schedule a lot of kernels, it appears that, on my machine, the first 1023 launches return immediately, and subsequent launches have to wait, presumably for the kernel FIFO queue to stay under 1024. I wonder if this limit can be changed? I couldn’t find anything about this in the documentation.
I’m attaching an example that demonstrates this behavior (derived from another poster’s code).
#include <iostream>
#include <ctime>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
__global__ void FindClosestGPU(float3* points, size_t* indices, size_t count)
{
if(count <= 1) return;
size_t id = threadIdx.x + blockIdx.x * blockDim.x;
if(id < count)
{
float3 thisPoint = points[id];
float smallestSoFar = 3.40282e38f;
for(size_t i = 0; i < count; i++)
{
if(i == id) continue;
float dist = (thisPoint.x - points[i].x) * (thisPoint.x - points[i].x);
dist += (thisPoint.y - points[i].y) * (thisPoint.y - points[i].y);
dist += (thisPoint.z - points[i].z) * (thisPoint.z - points[i].z);
if(dist < smallestSoFar)
{
smallestSoFar = dist;
indices[id] = i;
}
}
}
}
int main(int argc, char** argv)
{
int num_launches = 1040;
if(argc >= 2)
num_launches = std::atol(argv[1]);
const size_t count = 10000;
// Arrays of points
size_t *indexOfClosest = (size_t *)malloc(count * sizeof(size_t));
float3 *points = (float3 *)malloc(count * sizeof(float3));
float3 *d_points;
size_t *d_indexOfClosest;
cudaMalloc(&d_points, count * sizeof(float3));
cudaMalloc(&d_indexOfClosest, count * sizeof(size_t));
// Create a list of random points
for(size_t i = 0; i < count; i++)
{
points[i].x = (float)((rand()%10000) - 5000);
points[i].y = (float)((rand()%10000) - 5000);
points[i].z = (float)((rand()%10000) - 5000);
}
cudaMemcpy(d_points, points, sizeof(float3) * count, cudaMemcpyHostToDevice);
for(size_t j = 0; j < num_launches; ++j) {
long startTime = std::clock();
FindClosestGPU<<<(count / 1024) + 1, 1024>>>(d_points, d_indexOfClosest, count);
long finishTime = std::clock();
std::cout << j << " " << double(finishTime - startTime) / CLOCKS_PER_SEC * 1000 << " ms." << std::endl;
}
cudaMemcpy(indexOfClosest, d_indexOfClosest, sizeof(size_t) * count, cudaMemcpyDeviceToHost);
std::cout << "DONE" << std::endl;
// Print the final results to screen
std::cout << "Final results:" << std::endl;
for(size_t i = 0; i < 10; i++)
std::cout << i << "." << indexOfClosest[i] << std::endl;
// Deallocate RAM
free(points);
free(indexOfClosest);
cudaFree(d_indexOfClosest);
cudaFree(d_points);
cudaDeviceReset();
return 0;
}