Hello,
I am trying to get 14 concurrent kernels to work on a Tesla Kepler card (Tesla K20Xm with 14 SMX units).
If I launch them without recording any events before and after the launch, then I get exactly 14 concurrent kernels.
However, if I launch them with events, I only get 8 concurrent kernels.
Why is the use of events limiting me to 8 concurrent kernels?
Is there more information about:
- hardware queues (false inter-stream dependencies)
- implicit synchronization
- dependency checks
- delayed signals between sequentially issued kernel launches
- etc?
I thought most of these problems were solved on Kepler (HyperQ - 32 hardware queues)…
At the moment I only know about:
- http://docs.nvidia.com/cuda/index.html
- https://developer.nvidia.com/gpu-computing-webinars
- http://developer.download.nvidia.com/CUDA/training/StreamsAndConcurrencyWebinar.pdf (only Fermi)
Somehow I must have missed something…
events.cu:
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#define NUM_STREAMS 14
// Spins on the device until the clock64() counter has advanced by at least
// `delta` ticks past the value sampled when the thread entered the kernel.
// clock64() needs a compute capability 2.0+ build (e.g. '-arch=sm_20').
__global__ void kernel_wait64(uint64_t delta) {
    const uint64_t deadline = clock64() + delta;
    for (;;) {
        if (clock64() >= deadline) {
            break;
        }
    }
}
// Queries the CUDA runtime for its last recorded error. When one is pending,
// prints the error code and message together with the caller-supplied
// function name and line number, then terminates the process with status 1.
//   func: label of the CUDA call that was just issued (used only for the report)
//   line: source line of the call site (used only for the report)
void debug_check(const char *func, int line) {
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        return; // nothing to report
    }
    printf("Error #%d: %s\n", status, cudaGetErrorString(status));
    printf("Function: %s\n", func);
    printf("Line: %d\n", line);
    exit(1);
}
// Launches NUM_STREAMS independent wait kernels, one per stream, bracketing
// each launch with a pair of event records, then waits for the device to
// finish and releases every stream and event.
//
// Returns 0 on success; any CUDA error terminates the process via debug_check().
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    // The CUDA programming guide documents CUDA_DEVICE_MAX_CONNECTIONS
    // (range 1..32, default 8) as the number of hardware work queues between
    // host and device. With more streams than queues, streams share a queue
    // and the event records introduce false dependencies between them, which
    // caps concurrency at 8 kernels. The variable is read when the CUDA
    // context is initialised, so it has to be set before the first CUDA call.
    // A value already exported by the user is left untouched.
#ifdef _WIN32
    if (getenv("CUDA_DEVICE_MAX_CONNECTIONS") == NULL) {
        _putenv_s("CUDA_DEVICE_MAX_CONNECTIONS", "32");
    }
#else
    setenv("CUDA_DEVICE_MAX_CONNECTIONS", "32", 0 /* do not overwrite */);
#endif

    cudaStream_t stream[NUM_STREAMS];
    cudaEvent_t event_a[NUM_STREAMS];
    cudaEvent_t event_b[NUM_STREAMS];

    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaStreamCreate(&stream[i]);
        debug_check("cudaStreamCreate", __LINE__);
        cudaEventCreate(&event_a[i]);
        debug_check("cudaEventCreate", __LINE__);
        cudaEventCreate(&event_b[i]);
        debug_check("cudaEventCreate", __LINE__);
    }

    // this kernel waits for 700 million clocks.
    // so it waits less than 1 second on a Tesla K20Xm (Kepler).
    // Unsigned 64-bit literal so the value matches the kernel's uint64_t
    // parameter regardless of the platform's width of `long`.
    const uint64_t wait_clocks = 700ULL * 1000ULL * 1000ULL;

    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaEventRecord(event_a[i], stream[i]);
        debug_check("cudaEventRecord", __LINE__);
        kernel_wait64<<<1, 1, 0, stream[i]>>>(wait_clocks);
        debug_check("cudaLaunch", __LINE__);
        cudaEventRecord(event_b[i], stream[i]);
        debug_check("cudaEventRecord", __LINE__);
    }

    cudaDeviceSynchronize();
    debug_check("cudaDeviceSynchronize", __LINE__);

    // Release the events and streams created above; all work has completed
    // because of the device-wide synchronisation.
    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaEventDestroy(event_a[i]);
        debug_check("cudaEventDestroy", __LINE__);
        cudaEventDestroy(event_b[i]);
        debug_check("cudaEventDestroy", __LINE__);
        cudaStreamDestroy(stream[i]);
        debug_check("cudaStreamDestroy", __LINE__);
    }

    return 0;
}
Compile with nvcc (the clock64() function requires '-arch=sm_20' or newer):
nvcc -O2 -arch=sm_20 -o events.out events.cu
Any help would be really appreciated.