When will cudaFree() be called?

#include <cstdlib>
#include <iostream>
#include <cublas_v2.h>
#include <random>

// Multiply the arrays A and B on GPU and save the result in C
// C(m,n) = A(m,k) * B(k,n)
cublasStatus_t gpu_blas_mmul(cublasHandle_t handle, const float *A, const float *B, float *C, const int m, const int k, const int n)
{
	int lda = m, ldb = k, ldc = m;
	const float alf = 1;
	const float bet = 0;
	const float *alpha = &alf;
	const float *beta = &bet;

	// Do the actual multiplication 
	return cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}


void CPU_fill_rand(float *A, int size)
{
	std::random_device rd;
	std::default_random_engine eng(rd());
	std::uniform_real_distribution<float> distr(1.0, 2.0);

	for (int r = 0; r < size; ++r)
		A[r] = distr(eng);
}

int main(int argc, char* argv[])
{

	int cin_1 = 256;
	int cin_2 = 256;
	int cout_1 = 256;
	int cout_2 = 256;
	int imgsz = 56;

	// output(d_C) = weight_gemm(d_A) * input_gemm(d_B)
	int m1 = cout_1, m2 = cout_2;
	int k1 = cin_1 * 3 * 3, k2 = cin_2 * 3 * 3;
	int n1 = imgsz * imgsz, n2 = imgsz * imgsz;

	float *h_A1 = (float *)malloc(m1 * k1 * sizeof(float));
	float *h_B1 = (float *)malloc(k1 * n1 * sizeof(float));
	float *h_C1 = (float *)malloc(m1 * n1 * sizeof(float));
	float *h_A2 = (float *)malloc(m2 * k2 * sizeof(float));
	float *h_B2 = (float *)malloc(k2 * n2 * sizeof(float));
	float *h_C2 = (float *)malloc(m2 * n2 * sizeof(float));

	float *d_A1, *d_B1, *d_C1, *d_A2, *d_B2, *d_C2;
	cudaMalloc(&d_A1, m1 * k1 * sizeof(float));
	cudaMalloc(&d_B1, k1 * n1 * sizeof(float));
	cudaMalloc(&d_C1, m1 * n1 * sizeof(float));
	cudaMalloc(&d_A2, m2 * k2 * sizeof(float));
	cudaMalloc(&d_B2, k2 * n2 * sizeof(float));
	cudaMalloc(&d_C2, m2 * n2 * sizeof(float));

	// Fill the arrays on CPU with random numbers
	CPU_fill_rand(h_A1, m1 * k1);
	CPU_fill_rand(h_B1, k1 * n1);
	CPU_fill_rand(h_A2, m2 * k2);
	CPU_fill_rand(h_B2, k2 * n2);

	cudaMemcpy(d_A1, h_A1, m1 * k1 * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(d_B1, h_B1, k1 * n1 * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(d_A2, h_A2, m2 * k2 * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(d_B2, h_B2, k2 * n2 * sizeof(float), cudaMemcpyHostToDevice);

	cublasHandle_t handle;
	cublasCreate(&handle);
	
	cublasStatus_t stats1 = gpu_blas_mmul(handle, d_A1, d_B1, d_C1, m1, k1, n1);
	cublasStatus_t stats2 = gpu_blas_mmul(handle, d_A2, d_B2, d_C2, m2, k2, n2);
	if (stats1 != CUBLAS_STATUS_SUCCESS || stats2 != CUBLAS_STATUS_SUCCESS) {
		std::cout << "kernel1 : " << stats1 << std::endl;
		std::cout << "kernel2 : " << stats2 << std::endl;
		exit(EXIT_FAILURE);
	}
	}

	cudaDeviceSynchronize();
	cudaMemcpy(h_C1, d_C1, m1 * n1 * sizeof(float), cudaMemcpyDeviceToHost);
	cudaMemcpy(h_C2, d_C2, m2 * n2 * sizeof(float), cudaMemcpyDeviceToHost);
	cublasDestroy(handle);

	// Free GPU and CPU memory
	cudaFree(d_A1);
	cudaFree(d_B1);
	cudaFree(d_C1);
	cudaFree(d_A2);
	cudaFree(d_B2);
	cudaFree(d_C2);

	free(h_A1);
	free(h_B1);
	free(h_C1);
	free(h_A2);
	free(h_B2);
	free(h_C2);
	return 0;
}

When I profiled my program with the nsys tool, I ran into something confusing. As shown above, the kernels (the gpu_blas_mmul calls) are only launched after the cudaMemcpy calls finish, and there are no explicit memory-free operations at that point in the code. However, Nsight Systems shows that after the copies of d_A1, d_B1, d_A2, and d_B2 complete, cudaFree is called and takes a long time. Then there are some memory copy operations again, and only after that do the kernels seem to start executing. Why is cudaFree called before the kernels execute, and how does CUDA manage memory here?
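For reference, a profiling command along these lines produces the timeline described above (the report and executable names are just placeholders):

nsys profile --trace=cuda,cublas -o gemm_report ./gemm_test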

Those operations are part of the cuBLAS handle creation (cublasCreate). This can be shown with Nsight Systems and NVTX markers; see the edits in the code below.

#include <cstdlib>
#include <cublas_v2.h>
#include <iostream>
#include <random>

#define USE_NVTX

// ***************** FOR NVTX MARKERS *******************
#ifdef USE_NVTX
#include "nvtx3/nvToolsExt.h"

const uint32_t colors[] = {0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff,
                           0xff00ffff, 0xffff0000, 0xffffffff};
const int num_colors = sizeof(colors) / sizeof(uint32_t);

#define PUSH_RANGE(name, cid)                                                                      \
    {                                                                                              \
        int color_id = cid;                                                                        \
        color_id = color_id % num_colors;                                                          \
        nvtxEventAttributes_t eventAttrib = {0};                                                   \
        eventAttrib.version = NVTX_VERSION;                                                        \
        eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;                                          \
        eventAttrib.colorType = NVTX_COLOR_ARGB;                                                   \
        eventAttrib.color = colors[color_id];                                                      \
        eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;                                         \
        eventAttrib.message.ascii = name;                                                          \
        nvtxRangePushEx(&eventAttrib);                                                             \
    }
#define POP_RANGE() nvtxRangePop();
#else
#define PUSH_RANGE(name, cid)
#define POP_RANGE()
#endif
// ***************** FOR NVTX MARKERS *******************

// Multiply the arrays A and B on GPU and save the result in C
// C(m,n) = A(m,k) * B(k,n)
cublasStatus_t gpu_blas_mmul(cublasHandle_t handle, const float *A, const float *B, float *C,
                             const int m, const int k, const int n) {
    int lda = m, ldb = k, ldc = m;
    const float alf = 1;
    const float bet = 0;
    const float *alpha = &alf;
    const float *beta = &bet;

    // Do the actual multiplication
    return cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C,
                       ldc);
}

void CPU_fill_rand(float *A, int size) {
    std::random_device rd;
    std::default_random_engine eng(rd());
    std::uniform_real_distribution<float> distr(1.0, 2.0);

    for (int r = 0; r < size; ++r)
        A[r] = distr(eng);
}

int main(int argc, char *argv[]) {

    int cin_1 = 256;
    int cin_2 = 256;
    int cout_1 = 256;
    int cout_2 = 256;
    int imgsz = 56;

    // output(d_C) = weight_gemm(d_A) * input_gemm(d_B)
    int m1 = cout_1, m2 = cout_2;
    int k1 = cin_1 * 3 * 3, k2 = cin_2 * 3 * 3;
    int n1 = imgsz * imgsz, n2 = imgsz * imgsz;

    float *h_A1 = (float *)malloc(m1 * k1 * sizeof(float));
    float *h_B1 = (float *)malloc(k1 * n1 * sizeof(float));
    float *h_C1 = (float *)malloc(m1 * n1 * sizeof(float));
    float *h_A2 = (float *)malloc(m2 * k2 * sizeof(float));
    float *h_B2 = (float *)malloc(k2 * n2 * sizeof(float));
    float *h_C2 = (float *)malloc(m2 * n2 * sizeof(float));

    float *d_A1, *d_B1, *d_C1, *d_A2, *d_B2, *d_C2;
    cudaMalloc(&d_A1, m1 * k1 * sizeof(float));
    cudaMalloc(&d_B1, k1 * n1 * sizeof(float));
    cudaMalloc(&d_C1, m1 * n1 * sizeof(float));
    cudaMalloc(&d_A2, m2 * k2 * sizeof(float));
    cudaMalloc(&d_B2, k2 * n2 * sizeof(float));
    cudaMalloc(&d_C2, m2 * n2 * sizeof(float));

    // Fill the arrays on CPU with random numbers
    CPU_fill_rand(h_A1, m1 * k1);
    CPU_fill_rand(h_B1, k1 * n1);
    CPU_fill_rand(h_A2, m2 * k2);
    CPU_fill_rand(h_B2, k2 * n2);

    cudaMemcpy(d_A1, h_A1, m1 * k1 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B1, h_B1, k1 * n1 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_A2, h_A2, m2 * k2 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B2, h_B2, k2 * n2 * sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    PUSH_RANGE("cublasCreate", 0)
    cublasCreate(&handle);
    POP_RANGE()

    cublasStatus_t stats1 = gpu_blas_mmul(handle, d_A1, d_B1, d_C1, m1, k1, n1);
    cublasStatus_t stats2 = gpu_blas_mmul(handle, d_A2, d_B2, d_C2, m2, k2, n2);
    if (stats1 != CUBLAS_STATUS_SUCCESS || stats2 != CUBLAS_STATUS_SUCCESS) {
        std::cout << "kernel1 : " << stats1 << std::endl;
        std::cout << "kernel1 : " << stats2 << std::endl;
        exit(0);
    }

    cudaDeviceSynchronize();
    cudaMemcpy(h_C1, d_C1, m1 * n1 * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_C2, d_C2, m2 * n2 * sizeof(float), cudaMemcpyDeviceToHost);
    cublasDestroy(handle);

    // Free GPU and CPU memory
    cudaFree(d_A1);
    cudaFree(d_B1);
    cudaFree(d_C1);
    cudaFree(d_A2);
    cudaFree(d_B2);
    cudaFree(d_C2);

    free(h_A1);
    free(h_B1);
    free(h_C1);
    free(h_A2);
    free(h_B2);
    free(h_C2);
    return 0;
}
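With these markers in place, the Nsight Systems timeline shows the allocation, free, and copy activity that precedes the GEMM kernels sitting inside the cublasCreate range, which is what attributes the long cudaFree to handle creation rather than to your own code.

If you want a cross-check that does not rely on NVTX, here is a minimal sketch (not taken from the program above; it assumes the usual nvcc build linked with -lcublas) that measures how much device memory cublasCreate itself consumes. Error checking is omitted for brevity.

#include <cstdio>
#include <cublas_v2.h>
#include <cuda_runtime.h>

int main() {
    size_t free_before = 0, free_after = 0, total = 0;

    // Force CUDA context creation first so it is not attributed to cublasCreate.
    cudaFree(0);
    cudaMemGetInfo(&free_before, &total);

    cublasHandle_t handle;
    cublasCreate(&handle); // handle creation allocates its own device workspace/state

    cudaMemGetInfo(&free_after, &total);
    std::printf("device memory delta across cublasCreate: %lld bytes\n",
                (long long)free_before - (long long)free_after);

    cublasDestroy(handle);
    return 0;
}

The drop in free device memory, together with the matching allocation and free calls in the trace, is consistent with handle creation doing its own device memory management before any of your kernels run.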

Thanks so much
