I am a beginner in CUDA programming and have developed a test code to run on the GPU.
Code Overview:
My code launches three kernels, each executing with a single block and a single thread:
- Kernel 1 and Kernel 2 communicate via a flag in device memory. Kernel 1 sets the flag, and Kernel 2 reads it. This runs continuously inside a
while(1)
loop. - Kernel 3 waits for a flag from the host (CPU). The CPU sets the flag, and Kernel 3 reads it and prints the value.
Compilation Command:
I am compiling using:
nvc++ -cuda test_app.cu
(I use nvc++
because I plan to integrate OpenACC functions later.)
Issues Observed:
- No kernel print output – The expected print statements from the kernel threads do not appear.
- Host-side
cudaMemcpy
gets stuck – This happens when setting the flag for Kernel 3. - High GPU utilization – Running
tegrastats
shows:
GR3D_Freq 99%@675
Does this mean all SMs of the GPU are fully utilized? My code only has a few threads (2-3 at most), so I expected much lower usage.
Questions:
- Could you please help me identify any mistakes in my implementation?
- Why is
cudaMemcpy
getting stuck? - Why is my GPU showing high utilization despite running very few threads?
- How can I optimize my implementation to ensure proper synchronization?
Here is the test code I am using:
include < iostream>
include <cuda_runtime.h>
include <unistd.h>global void kernel1(volatile unsigned int* flag1) {
while (true) {
printf(“Kernel1: Setting flag1.\n”);
atomicExch((unsigned int*)flag1, 1);
__threadfence(); // Ensure visibility to other threads
__nanosleep(10000000); // Sleep for 10ms
}
}global void kernel2(volatile unsigned int* flag1) {
while (true) {
if (atomicCAS((unsigned int*)flag1, 1, 0) == 1) { // Only clear if set
printf(“Kernel2: Received flag1. Clearing flag1.\n”);
__threadfence();
}
__nanosleep(10000000); // Sleep for 10ms
}
}global void kernel3(volatile unsigned int* flag2) {
while (true) {
if (atomicCAS((unsigned int*)flag2, 1, 0) == 1) { // Check and clear flag
printf(“Kernel3: Received flag2 from host. Clearing flag2.\n”);
__threadfence();
}
__nanosleep(10000000); // Sleep for 10ms
}
}int main() {
unsigned int h_flag2 = 0; // Host flag
unsigned int *d_flag1, *d_flag2;cudaMalloc((void**)&d_flag1, sizeof(unsigned int)); cudaMalloc((void**)&d_flag2, sizeof(unsigned int)); cudaMemset(d_flag1, 0, sizeof(unsigned int)); cudaMemset(d_flag2, 0, sizeof(unsigned int)); // Launch single-threaded kernels kernel1<<<1,1>>>(d_flag1); kernel2<<<1,1>>>(d_flag1); kernel3<<<1,1>>>(d_flag2); sleep(2); // Allow kernels to start printf("Host: Setting flag2 for Kernel3.\n"); h_flag2 = 1; cudaMemcpy(d_flag2, &h_flag2, sizeof(unsigned int), cudaMemcpyHostToDevice);
printf(“Host: Flag2 Set for Kernel3.\n”);
sleep(1);cudaFree(d_flag1); cudaFree(d_flag2); return 0;
}