No printf(".") output from the kernel

I am running 496 threads in the 1 block. Each thread is running in a loop where n is about 100E9.

I would like to see the progress by adding print command:

if (threadIdx.x == 0 && n % 1000000ull == 0) {
  printf(".");
}

It seems that there is not print output to the console until the program finishes. Is there any way how to force the print command to output data immediately? Another option would be to use keyboard shortcut to close the program and let him flush the buffer to the console.

printf buffer flushing occurs as per conditions outlined here.

1 Like

If it is a kernel progress report you want, this may also be of interest.

7.33.2. Limitations states:
The output buffer for printf() is set to a fixed size before kernel launch (see Associated Host-Side API). It is circular and if more output is produced during kernel execution than can fit in the buffer, older output is overwritten. It is flushed only when one of these actions is performed:

  • Kernel launch via <<<>>> or cuLaunchKernel() (at the start of the launch, and if the CUDA_LAUNCH_BLOCKING environment variable is set to 1, at the end of the launch as well),
  • Synchronization via cudaDeviceSynchronize(), cuCtxSynchronize(), cudaStreamSynchronize(), cuStreamSynchronize(), cudaEventSynchronize(), or cuEventSynchronize(),
  • Memory copies via any blocking version of cudaMemcpy*() or cuMemcpy*(),
  • Module loading/unloading via cuModuleLoad() or cuModuleUnload(),
  • Context destruction via cudaDeviceReset() or cuCtxDestroy().
  • Prior to executing a stream callback added by cudaStreamAddCallback or cuStreamAddCallback.

What to concretely use in my case?

What that list tells you is that you cannot use printing dots via device-side printf() for a progress report, because the device-side printf-buffer is not continuously copied over to the host while a kernel is in progress. Specific synchronizing activity initiated by the host (enumerated above) is required for the buffer to be copied from device to host.

If you desire a progress report from a running kernel, follow the link to a worked example of how to accomplish that in @Robert_Crovella’s post above.

Unfortunately, the example doesn’t work (VS22). It shows just 100% when the program finished:

kernel starting
progress check finished
h_data = 10
matrix multiply kernel starting
Progress:
percent complete = 100.0

matrix multiply finished. elapsed time = 6877.415039 milliseconds

Programm:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <time.h>

#define TIME_INC 100000000
#define INCS 10
#define USE_PROGRESS 1
#define MAT_DIMX 4000
#define MAT_DIMY MAT_DIMX

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

__global__ void mykernel(volatile int* data) {

    unsigned long time;
    for (int i = 0; i < INCS; i++) {
        atomicAdd((int*)data, 1);
        __threadfence_system();
        time = clock64();
        while ((clock64() - time) < TIME_INC) {};
    }
    printf("progress check finished\n");
}

__global__ void matmult(float* a, float* b, float* c, unsigned int rowA, unsigned int colA, unsigned int colB, volatile int* progress) {
    unsigned int row = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned int col = threadIdx.y + blockDim.y * blockIdx.y;
    if ((row < rowA) && (col < colB)) {
        float temp = 0.0f;
        for (unsigned int k = 0; k < colA; k++)
            temp += a[(row * colA) + k] * b[(k * colB) + col];
        c[(row * colB) + col] = temp;
#if USE_PROGRESS
        if (!(threadIdx.x || threadIdx.y)) {
            atomicAdd((int*)progress, 1);
            __threadfence_system();
        }
#endif
    }
}

int main() {
    // simple test to demonstrate reading progress data from kernel
    volatile int* d_data, * h_data;
    cudaSetDeviceFlags(cudaDeviceMapHost);
    cudaCheckErrors("cudaSetDeviceFlags error");
    cudaHostAlloc((void**)&h_data, sizeof(int), cudaHostAllocMapped);
    cudaCheckErrors("cudaHostAlloc error");
    cudaHostGetDevicePointer((int**)&d_data, (int*)h_data, 0);
    cudaCheckErrors("cudaHostGetDevicePointer error");
    *h_data = 0;
    printf("kernel starting\n");
    mykernel << <1, 1 >> > (d_data);
    cudaCheckErrors("kernel fail");
    int value = 0;
    do {
        int value1 = *h_data;
        if (value1 > value) {
            printf("h_data = %d\n", value1);
            value = value1;
        }
    } while (value < (INCS - 1));
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail 2");

    // now try matrix multiply with progress

    float* h_c, * d_a, * d_b, * d_c;
    h_c = (float*)malloc(MAT_DIMX * MAT_DIMY * sizeof(float));
    if (h_c == NULL) { printf("malloc fail\n"); return 1; }
    cudaMalloc((void**)&d_a, MAT_DIMX * MAT_DIMY * sizeof(float));
    cudaCheckErrors("cudaMalloc a fail");
    cudaMalloc((void**)&d_b, MAT_DIMX * MAT_DIMY * sizeof(float));
    cudaCheckErrors("cudaMalloc b fail");
    cudaMalloc((void**)&d_c, MAT_DIMX * MAT_DIMY * sizeof(float));
    cudaCheckErrors("cudaMalloc c fail");

    for (int i = 0; i < MAT_DIMX * MAT_DIMY; i++) h_c[i] = rand() / (float)RAND_MAX;
    cudaMemcpy(d_a, h_c, MAT_DIMX * MAT_DIMY * sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy a fail");
    cudaMemcpy(d_b, h_c, MAT_DIMX * MAT_DIMY * sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy b fail");

    cudaEvent_t start, stop;
    cudaEventCreate(&start); cudaEventCreate(&stop);
    *h_data = 0;
    dim3 block(16, 16);
    dim3 grid(((MAT_DIMX + block.x - 1) / block.x), ((MAT_DIMY + block.y - 1) / block.y));
    printf("matrix multiply kernel starting\n");
    cudaEventRecord(start);
    matmult << <grid, block >> > (d_a, d_b, d_c, MAT_DIMY, MAT_DIMX, MAT_DIMX, d_data);
    cudaEventRecord(stop);
#if USE_PROGRESS
    unsigned int num_blocks = grid.x * grid.y;
    float my_progress = 0.0f;
    value = 0;
    printf("Progress:\n");
    do {
        cudaEventQuery(stop);  // may help WDDM scenario
        int value1 = *h_data;
        float kern_progress = (float)value1 / (float)num_blocks;
        if ((kern_progress - my_progress) > 0.1f) {
            printf("percent complete = %2.1f\n", (kern_progress * 100));
            my_progress = kern_progress;
        }
    } while (my_progress < 0.9f);
    printf("\n");
#endif
    cudaEventSynchronize(stop);
    cudaCheckErrors("event sync fail");
    float et;
    cudaEventElapsedTime(&et, start, stop);
    cudaCheckErrors("event elapsed time fail");
    cudaDeviceSynchronize();
    cudaCheckErrors("mat mult kernel fail");
    printf("matrix multiply finished.  elapsed time = %f milliseconds\n", et);


    return 0;
}

try enabling windows hardware scheduling.