CUDA device memory allocation fails

My device is an RTX 2080 Ti, and I am trying to write a CUDA program for it. However, when I try to allocate device memory, I always get an "out of memory" error from one specific call. I have made sure that there is enough free GPU memory, and the strangest thing is that I can allocate even more GPU memory for the other pointers; only that one allocation fails. My code is as follows.

float caltrace_gpu(float* W, float* WTWInv, float * data, int N, long long M,
                int C, int num_block, float* mean, float* rv,  cublasHandle_t handle)
{
    // Set cuda context
    cudaError_t cudaStat;
    cublasStatus_t stat;
    
    // Initialize device pointer
    float* d_data;
    float* d_W;
    float* d_WTWInv;
    float* d_v;
    float* d_result;
    float* d_result2;
    float* d_temp;
    float* d_one;
    float* d_mean;
    float prod;
    float* d_cby1;
    float* d_cby2;
    float* d_cby3;
    float* d_R;
    float* d_data_final;
    float* d_temp_final;
    //Initialize one vector in order to perform element wise sub
    float* vector_ones = new float[N];
    for (long long i = 0; i < N; i++)
    {
        vector_ones[i] = 1.0f;
    }
    // The coefficient needed in computation
    float al = 1.0f;
    float al_n = -1.0f;
    float beta = 0.0f;
    
    // settings for the blocks
    int P = num_block;
    int SNIPs = static_cast<int>(M / P + 1);
    int SNIPs_final = M - SNIPs * (P - 1);
    
    // Allocate memory on device
    cudaStat = cudaMalloc((void**)&d_data, SNIPs * N * sizeof(float));
    if (cudaStat != cudaSuccess) {
        cout << "device memory allocation failed 1" << endl;
        return EXIT_FAILURE;
    }
    cudaStat = cudaMalloc((void**)&d_W, N * C * sizeof(float));
    if (cudaStat != cudaSuccess) {
        cout << "device memory allocation failed 2" << endl;
        return EXIT_FAILURE;
    }
    cudaStat = cudaMalloc((void**)&d_WTWInv, C * C * sizeof(float));
    if (cudaStat != cudaSuccess) {
        cout << "device memory allocation failed 3" << endl;
        return EXIT_FAILURE;
    }
    cudaStat = cudaMalloc((void**)&d_v, N * sizeof(float));
    if (cudaStat != cudaSuccess) {
        cout << "device memory allocation failed 4" << endl;
        return EXIT_FAILURE;
    }
    cudaStat = cudaMalloc((void**)&d_result2, N * sizeof(float));
    if (cudaStat != cudaSuccess) {
        cout << "device memory allocation failed 5" << endl;
        return EXIT_FAILURE;
    }
    cudaStat = cudaMalloc((void**)&d_result, N * sizeof(float));
    if (cudaStat != cudaSuccess) {
        cout << "device memory allocation failed 6" << endl;
        return EXIT_FAILURE;
    }
    cudaStat = cudaMalloc((void**)&d_temp, SNIPs * sizeof(float));
    if (cudaStat != cudaSuccess) {
        cout << "device memory allocation failed 7" << endl;
        return EXIT_FAILURE;
    }
    cudaStat = cudaMalloc((void**)&d_one, N * sizeof(float));
    if (cudaStat != cudaSuccess) {
        cout << "device memory allocation failed 8" << endl;
        return EXIT_FAILURE;
    }
    // cudaStat = cudaMalloc((void**)&d_mean, SNIPs * sizeof(float));
    cudaStat = cudaMalloc((void**)&d_cby1, C * sizeof(float));
    if (cudaStat != cudaSuccess) {
        cout << "device memory allocation failed 9" << endl;
        return EXIT_FAILURE;
    }
    cudaStat = cudaMalloc((void**)&d_cby2, C * sizeof(float));
    if (cudaStat != cudaSuccess) {
        cout << "device memory allocation failed 10" << endl;
        return EXIT_FAILURE;
    }
    cudaStat = cudaMalloc((void**)&d_cby3, N * sizeof(float));
    if (cudaStat != cudaSuccess) {
        cout << "device memory allocation failed 11" << endl;
        return EXIT_FAILURE;
    }

The allocation for d_WTWInv always fails. However, when I remove that call, the allocations for the pointers that follow it succeed. C is equal to 20.