Hi all,
I’ve run into inconsistent execution times when launching a simple CUDA kernel twice in a row within a single process. The kernel performs a basic vector addition (A + B = C). The first launch reports about 10 ms, but the second reports almost 0 ms. I suspect the kernels are somehow running concurrently, despite my intention to execute them sequentially.
To illustrate: if I run two kernels whose execution times should be 10 ms and 15 ms respectively, the second one incorrectly reports only 5 ms even though I launch them one after the other.
Here are the details of my setup:
- CUDA Version: 12.5
- Array Size: 50 * 1024 * 1024 bytes (nElements = array_size / sizeof(int))
- Threads Per Block: 1024
- Blocks Per Grid: calculated from the array size and threads per block as (nElements + threadsPerBlock - 1) / threadsPerBlock
Below is the full code:
#include <iostream>
#include <iomanip>
#include <chrono>
#include <thread>
#include <assert.h>
#include <vector>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
static inline void checkDrvError(CUresult res, const char *tok, const char *file, unsigned line) {
    if (res != CUDA_SUCCESS) {
        const char *errStr = NULL;
        (void)cuGetErrorString(res, &errStr);
        std::cerr << file << ':' << line << ' ' << tok << " failed (" << (unsigned)res << "): " << errStr << std::endl;
    }
}
#define CHECK_DRV(x) checkDrvError(x, #x, __FILE__, __LINE__);
typedef std::chrono::nanoseconds ReportingDuration;
#define CHECK_CUDA(ans) { check_cuda((ans), __FILE__, __LINE__); }
inline void check_cuda(int code, const char *file, int line, bool abort=true) {
    if (code != 0) {
        fprintf(stderr, "[customHook][%s:%3d]: CUDA ERROR: %d\n", file, line, code);
        if (abort) exit(code);
    }
}
__global__ void add(int *a, int *b, int *c, int nElements) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < nElements) {
        c[i] = a[i] + b[i];
    }
}
double what_time_is_it_now() {
    struct timespec time;
    if (clock_gettime(CLOCK_MONOTONIC, &time) == -1) exit(-1);
    return (double)time.tv_sec + (double)time.tv_nsec * 1e-9;
}
int main() {
    size_t array_size = 50 * 1024 * 1024;  // size in bytes
    int nElements = array_size / sizeof(int);
    int *A, *B, *C;
    int threadsPerBlock = 1024;
    int blocksPerGrid = (nElements + threadsPerBlock - 1) / threadsPerBlock;
    double start, end;

    assert(array_size % (2 * 1024 * 1024) == 0);

    CHECK_CUDA(cudaMalloc(&A, array_size));
    CHECK_CUDA(cudaMalloc(&B, array_size));
    CHECK_CUDA(cudaMalloc(&C, array_size));

    // Fill A and B on the host, then copy to the device.
    int *host_A = (int *)malloc(array_size);
    int *host_B = (int *)malloc(array_size);
    for (int i = 0; i < nElements; i++) {
        host_A[i] = 2;
        host_B[i] = 4;
    }
    CHECK_CUDA(cudaMemcpy(A, host_A, array_size, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(B, host_B, array_size, cudaMemcpyHostToDevice));

    // First kernel: C = A + B
    start = what_time_is_it_now();
    add<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, nElements);
    end = what_time_is_it_now();
    printf("VectorAdd: %.4fms\n", (end - start) * 1000);

    // Second kernel: A = B + C
    start = what_time_is_it_now();
    add<<<blocksPerGrid, threadsPerBlock>>>(B, C, A, nElements);
    end = what_time_is_it_now();
    printf("VectorAdd: %.4fms\n", (end - start) * 1000);

    CHECK_CUDA(cudaFree(A));
    CHECK_CUDA(cudaFree(B));
    CHECK_CUDA(cudaFree(C));
    free(host_A);
    free(host_B);
    return 0;
}
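For completeness, I compile with something like the following (the file name is just what I happened to use):

nvcc -O2 -o vector_add vector_add.cu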
Questions:
- Why does the second kernel execution report a much shorter execution time?
- Is there a possible overlap or concurrency issue with the kernel launches?
- How can I ensure that the kernels execute sequentially, and how do I measure their execution times accurately? (See the sketch below for what I'm considering.)
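For reference, here is the kind of change I'm considering. My (possibly wrong) understanding is that kernel launches are asynchronous, so my CPU timer may only be measuring launch overhead rather than the kernel itself. Option 1 synchronizes before reading the clock; option 2 times the kernel with CUDA events. This reuses the variables from the code above, and the evStart/evStop/elapsedMs names are my own:

// Option 1: block until the kernel has actually finished before reading the CPU clock.
start = what_time_is_it_now();
add<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, nElements);
CHECK_CUDA(cudaDeviceSynchronize());  // wait for the kernel to complete
end = what_time_is_it_now();
printf("VectorAdd (synced): %.4fms\n", (end - start) * 1000);

// Option 2: measure on the GPU with CUDA events bracketing the launch.
cudaEvent_t evStart, evStop;
CHECK_CUDA(cudaEventCreate(&evStart));
CHECK_CUDA(cudaEventCreate(&evStop));
CHECK_CUDA(cudaEventRecord(evStart));
add<<<blocksPerGrid, threadsPerBlock>>>(B, C, A, nElements);
CHECK_CUDA(cudaEventRecord(evStop));
CHECK_CUDA(cudaEventSynchronize(evStop));  // wait until evStop has been recorded
float elapsedMs = 0.0f;
CHECK_CUDA(cudaEventElapsedTime(&elapsedMs, evStart, evStop));
printf("VectorAdd (events): %.4fms\n", elapsedMs);
CHECK_CUDA(cudaEventDestroy(evStart));
CHECK_CUDA(cudaEventDestroy(evStop));

Is one of these the recommended way to benchmark a kernel, or is there a better approach?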
Any insights or suggestions would be greatly appreciated.
Thanks in advance!