No printf(".") output from the kernel

tiputa · March 15, 2023, 12:01am

I am running 496 threads in the 1 block. Each thread is running in a loop where n is about 100E9.

I would like to see the progress by adding print command:

if (threadIdx.x == 0 && n % 1000000ull == 0) {
  printf(".");
}

It seems that there is not print output to the console until the program finishes. Is there any way how to force the print command to output data immediately? Another option would be to use keyboard shortcut to close the program and let him flush the buffer to the console.

rs277 · March 15, 2023, 12:10am

printf buffer flushing occurs as per conditions outlined here.

Robert_Crovella · March 15, 2023, 12:19am

If it is a kernel progress report you want, this may also be of interest.

tiputa · March 15, 2023, 7:10am

7.33.2. Limitations states:
The output buffer for printf() is set to a fixed size before kernel launch (see Associated Host-Side API). It is circular and if more output is produced during kernel execution than can fit in the buffer, older output is overwritten. It is flushed only when one of these actions is performed:

Kernel launch via <<<>>> or cuLaunchKernel() (at the start of the launch, and if the CUDA_LAUNCH_BLOCKING environment variable is set to 1, at the end of the launch as well),
Synchronization via cudaDeviceSynchronize(), cuCtxSynchronize(), cudaStreamSynchronize(), cuStreamSynchronize(), cudaEventSynchronize(), or cuEventSynchronize(),
Memory copies via any blocking version of cudaMemcpy*() or cuMemcpy*(),
Module loading/unloading via cuModuleLoad() or cuModuleUnload(),
Context destruction via cudaDeviceReset() or cuCtxDestroy().
Prior to executing a stream callback added by cudaStreamAddCallback or cuStreamAddCallback.

What to concretely use in my case?

njuffa · March 15, 2023, 7:38am

What that list tells you is that you cannot use printing dots via device-side printf() for a progress report, because the device-side printf-buffer is not continuously copied over to the host while a kernel is in progress. Specific synchronizing activity initiated by the host (enumerated above) is required for the buffer to be copied from device to host.

If you desire a progress report from a running kernel, follow the link to a worked example of how to accomplish that in @Robert_Crovella’s post above.

tiputa · March 15, 2023, 8:18am

Unfortunately, the example doesn’t work (VS22). It shows just 100% when the program finished:

kernel starting
progress check finished
h_data = 10
matrix multiply kernel starting
Progress:
percent complete = 100.0

matrix multiply finished. elapsed time = 6877.415039 milliseconds

Programm:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <time.h>

#define TIME_INC 100000000
#define INCS 10
#define USE_PROGRESS 1
#define MAT_DIMX 4000
#define MAT_DIMY MAT_DIMX

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

__global__ void mykernel(volatile int* data) {

    unsigned long time;
    for (int i = 0; i < INCS; i++) {
        atomicAdd((int*)data, 1);
        __threadfence_system();
        time = clock64();
        while ((clock64() - time) < TIME_INC) {};
    }
    printf("progress check finished\n");
}

__global__ void matmult(float* a, float* b, float* c, unsigned int rowA, unsigned int colA, unsigned int colB, volatile int* progress) {
    unsigned int row = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned int col = threadIdx.y + blockDim.y * blockIdx.y;
    if ((row < rowA) && (col < colB)) {
        float temp = 0.0f;
        for (unsigned int k = 0; k < colA; k++)
            temp += a[(row * colA) + k] * b[(k * colB) + col];
        c[(row * colB) + col] = temp;
#if USE_PROGRESS
        if (!(threadIdx.x || threadIdx.y)) {
            atomicAdd((int*)progress, 1);
            __threadfence_system();
        }
#endif
    }
}

int main() {
    // simple test to demonstrate reading progress data from kernel
    volatile int* d_data, * h_data;
    cudaSetDeviceFlags(cudaDeviceMapHost);
    cudaCheckErrors("cudaSetDeviceFlags error");
    cudaHostAlloc((void**)&h_data, sizeof(int), cudaHostAllocMapped);
    cudaCheckErrors("cudaHostAlloc error");
    cudaHostGetDevicePointer((int**)&d_data, (int*)h_data, 0);
    cudaCheckErrors("cudaHostGetDevicePointer error");
    *h_data = 0;
    printf("kernel starting\n");
    mykernel << <1, 1 >> > (d_data);
    cudaCheckErrors("kernel fail");
    int value = 0;
    do {
        int value1 = *h_data;
        if (value1 > value) {
            printf("h_data = %d\n", value1);
            value = value1;
        }
    } while (value < (INCS - 1));
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail 2");

    // now try matrix multiply with progress

    float* h_c, * d_a, * d_b, * d_c;
    h_c = (float*)malloc(MAT_DIMX * MAT_DIMY * sizeof(float));
    if (h_c == NULL) { printf("malloc fail\n"); return 1; }
    cudaMalloc((void**)&d_a, MAT_DIMX * MAT_DIMY * sizeof(float));
    cudaCheckErrors("cudaMalloc a fail");
    cudaMalloc((void**)&d_b, MAT_DIMX * MAT_DIMY * sizeof(float));
    cudaCheckErrors("cudaMalloc b fail");
    cudaMalloc((void**)&d_c, MAT_DIMX * MAT_DIMY * sizeof(float));
    cudaCheckErrors("cudaMalloc c fail");

    for (int i = 0; i < MAT_DIMX * MAT_DIMY; i++) h_c[i] = rand() / (float)RAND_MAX;
    cudaMemcpy(d_a, h_c, MAT_DIMX * MAT_DIMY * sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy a fail");
    cudaMemcpy(d_b, h_c, MAT_DIMX * MAT_DIMY * sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy b fail");

    cudaEvent_t start, stop;
    cudaEventCreate(&start); cudaEventCreate(&stop);
    *h_data = 0;
    dim3 block(16, 16);
    dim3 grid(((MAT_DIMX + block.x - 1) / block.x), ((MAT_DIMY + block.y - 1) / block.y));
    printf("matrix multiply kernel starting\n");
    cudaEventRecord(start);
    matmult << <grid, block >> > (d_a, d_b, d_c, MAT_DIMY, MAT_DIMX, MAT_DIMX, d_data);
    cudaEventRecord(stop);
#if USE_PROGRESS
    unsigned int num_blocks = grid.x * grid.y;
    float my_progress = 0.0f;
    value = 0;
    printf("Progress:\n");
    do {
        cudaEventQuery(stop);  // may help WDDM scenario
        int value1 = *h_data;
        float kern_progress = (float)value1 / (float)num_blocks;
        if ((kern_progress - my_progress) > 0.1f) {
            printf("percent complete = %2.1f\n", (kern_progress * 100));
            my_progress = kern_progress;
        }
    } while (my_progress < 0.9f);
    printf("\n");
#endif
    cudaEventSynchronize(stop);
    cudaCheckErrors("event sync fail");
    float et;
    cudaEventElapsedTime(&et, start, stop);
    cudaCheckErrors("event elapsed time fail");
    cudaDeviceSynchronize();
    cudaCheckErrors("mat mult kernel fail");
    printf("matrix multiply finished.  elapsed time = %f milliseconds\n", et);


    return 0;
}

Robert_Crovella · March 15, 2023, 2:28pm

try enabling windows hardware scheduling.

Topic		Replies	Views
A simple question about printf() inside a kernel with no convincing answer on google or nvidia docs CUDA Programming and Performance	8	5929	August 4, 2019
Cuda 8.0 - Simple question regarding the use of printf from kernel. CUDA Programming and Performance	2	3664	April 4, 2017
printf problem CUDA Programming and Performance	3	1125	December 5, 2012
Printf isn't working in my kernel? CUDA Programming and Performance	1	7335	January 10, 2011
printf behavior change on driver update CUDA Programming and Performance	0	2303	March 18, 2012
printf not working CUDA Programming and Performance	2	9308	August 17, 2010
is there any limit of printf buffer in a kernel? CUDA Programming and Performance	1	2126	February 12, 2014
Doubts in using printf in kernel code CUDA Programming and Performance	1	941	October 9, 2011
Guidance regarding output processing CUDA Programming and Performance	10	628	October 5, 2020
printf vs cuPrintf in kernels CUDA Programming and Performance	2	5951	February 5, 2013

No printf(".") output from the kernel

Related topics