cufftExecZ2Z runs at inconsistent speeds in a WinAPI application

I am creating a small application with a graphical interface for calculating the Fourier transform. cufftExecZ2Z runs at inconsistent speeds in this WinAPI application. The first time I press the button, the Fourier transform of the 4096x4096 array is performed in 0.017-0.019 seconds, but after 5-6 clicks the execution time grows to 0.06-0.21 seconds. Is it possible to fix this somehow so that the transform is always performed in 0.019-0.06 seconds? I tried to write my own two-dimensional Fourier transform, but failed to achieve comparable timing results. Could you give me some advice? Maybe someone has managed to write a high-quality one-dimensional Fourier transform.

#include <stdio.h>
#include <stdlib.h>

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include <windows.h>

#include <cuda_runtime.h>
//#include <cuda_gl_interop.h>
#include <cufft.h>
#include <vector_types.h>




// Window procedure registered for the "MainWindowClass" window class; defined after wWinMain.
LRESULT CALLBACK WindowProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam);
// Device (GPU) buffer that the in-place 2D FFT reads and writes; managed by the
// window-procedure code further down in this file.
cufftDoubleComplex* d_data_2d;

// Application entry point: registers the main window class, creates the main
// window with its "BPF cu" button and STATIC child windows, then pumps messages.
//
// Parameters follow the standard Win32 wWinMain contract; hPrevInstance and
// lpCmdLine are not used.
//
// Returns the wParam of the WM_QUIT message that ended the message loop, 0 if
// the window could not be set up, or 1 if GetMessage reported an error.
int WINAPI wWinMain(_In_ HINSTANCE hInstance,
    _In_opt_ HINSTANCE hPrevInstance,
    _In_ LPWSTR    lpCmdLine,
    _In_ int       nCmdShow)
{
    UNREFERENCED_PARAMETER(hPrevInstance);
    UNREFERENCED_PARAMETER(lpCmdLine);

    // Register the window class. The explicit W variants are used because every
    // string literal below is a wide (L"...") literal.
    WNDCLASSW wc = { 0 };
    wc.lpfnWndProc = WindowProc;
    wc.hInstance = hInstance;
    // A system colour index must be offset by one when it is used as a class brush.
    wc.hbrBackground = (HBRUSH)(COLOR_BACKGROUND + 1);
    wc.lpszClassName = L"MainWindowClass";

    if (!RegisterClassW(&wc))
    {
        return 0;
    }

    HWND hwnd = CreateWindowW(L"MainWindowClass", L"Main Window",
        WS_OVERLAPPEDWINDOW | WS_VISIBLE,
        0, 0, 1080, 800, NULL, NULL, hInstance, NULL);
    if (hwnd == NULL)
    {
        return 0;
    }

    CreateWindowW(
        L"BUTTON",  // Predefined class
        L"BPF cu",  // Button text
        WS_TABSTOP | WS_VISIBLE | WS_CHILD | BS_DEFPUSHBUTTON,  // Styles
        10,         // x position
        10,         // y position
        100,        // Button width
        100,        // Button height
        hwnd,       // Parent window
        (HMENU)1,   // For a child window this is the control ID reported in WM_COMMAND
        hInstance,  // Owning module (the original comma expression yielded GWLP_HINSTANCE instead)
        NULL);      // Pointer not needed.

    CreateWindowW(L"STATIC", L"CHILD WINDOW",
        WS_CHILD | WS_BORDER | WS_VISIBLE,
        110, 110, 600, 600, hwnd, NULL, hInstance, NULL);
    // NOTE(review): the next two STATIC windows share the same position and
    // size, so they overlap completely -- confirm whether that is intended.
    CreateWindowW(L"STATIC", L"CHILD WINDOW",
        WS_CHILD | WS_BORDER | WS_VISIBLE,
        710, 10, 20, 20, hwnd, NULL, hInstance, NULL);
    CreateWindowW(L"STATIC", L"CHILD WINDOW",
        WS_CHILD | WS_BORDER | WS_VISIBLE,
        710, 10, 20, 20, hwnd, NULL, hInstance, NULL);
    CreateWindowW(L"STATIC", L"CHILD WINDOW",
        WS_CHILD | WS_BORDER | WS_VISIBLE,
        710, 30, 20, 20, hwnd, NULL, hInstance, NULL);
    CreateWindowW(L"STATIC", L"CHILD WINDOW",
        WS_CHILD | WS_BORDER | WS_VISIBLE,
        710, 50, 20, 20, hwnd, NULL, hInstance, NULL);

    ShowWindow(hwnd, nCmdShow);
    UpdateWindow(hwnd);

    // enable OpenGL for the window
    // EnableOpenGL(hwother, &hDC, &hRC);

    // GetMessage returns 0 on WM_QUIT and -1 on error; only a positive result
    // carries a message that should be dispatched.
    MSG msg = { 0 };
    BOOL fetched;
    while ((fetched = GetMessageW(&msg, NULL, 0, 0)) > 0)
    {
        TranslateMessage(&msg);
        DispatchMessageW(&msg);
    }

    return (fetched == 0) ? (int)msg.wParam : 1;
}

LRESULT CALLBACK WindowProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam) {
    switch (uMsg) {
    case WM_COMMAND:
        switch (wParam)
        {
        case 1:
            cufftDoubleComplex* h_data_2d;
            int N = 4096;
            h_data_2d = new cufftDoubleComplex[4096 * 4096];
            for (int i = 0; i < N; i++)
            {
                for (int j = 0; j < N; j++)
                {
                    h_data_2d[j + i * N] = make_cuDoubleComplex(j+1, 0.0);
                }
            }
            cudaMalloc((void**)&d_data_2d, N * N * sizeof(cufftDoubleComplex ));
            cudaMemcpy(h_data_2d, d_data_2d, sizeof(cufftDoubleComplex) * N * N, cudaMemcpyHostToDevice);
            cufftHandle plan;
            int n[2] = { N, N };
            cufftPlan2d(&plan, N, N, CUFFT_Z2Z);
            cudaEvent_t start, stop, start1, stop1;
           
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
            cudaEventRecord(start, 0);
//FFT
            cufftExecZ2Z(plan, d_data_2d, d_data_2d, CUFFT_FORWARD);
//FFT
            cudaDeviceSynchronize();


            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);
            float gpuTime = 0.0f;
            cudaEventElapsedTime(&gpuTime, start, stop);

            cufftDestroy(plan);
            cudaEventDestroy(start);
            cudaEventDestroy(stop);
            cudaFree(d_data_2d);
            delete(h_data_2d);



            gpuTime = gpuTime / 1000;
            
            std::string test1 = std::to_string(gpuTime);
            test1.append("_seconds");
            MessageBoxA(NULL, test1.c_str(), "time", MB_OK);
            
        }
        break;
    case WM_CLOSE:
        DestroyWindow(hwnd); // Close only the child window
        break;
        //case WM_DESTROY:
        //    PostQuitMessage(1);
        //break;
    case WM_CREATE:
    {

    }

    break;

    default:
        return DefWindowProc(hwnd, uMsg, wParam, lParam);
    }
    return 0;
}

Could you perhaps recommend another GUI framework where such problems do not occur?

Welcome @Kelik1 to the NVIDIA developer forums.

I took the liberty of moving your post to the CUDA programming and performance category, I think you might find help here.

Thanks!

Does the array size change each time you click the button? If not, I suggest moving all of your memory management and cuFFT plan management out of the switch statement and into initialization and destruction functions that run only once during the life of the program.