MultiHeadAttnForward Result


I’m trying to use the `cudnnMultiHeadAttnForward` function. I had some difficulties with `CUDNN_STATUS_BAD_PARAM` errors, but those are now a thing of the past (at least I hope so).
My current problem is the result of the forward pass: the function returns an output array filled entirely with zeros, which I don’t think is the right result.

Is my usage of the function, or are the parameters I pass to it, incorrect?

Here is my code:

#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>
#include <cudnn.h>

// Evaluates a CUDA runtime call `f`; if it does not return cudaSuccess, prints
// the numeric code, the readable error string and the call site, then exits
// the process with status 1.
// Wrapped in do { ... } while (0) so that `CUDA_CALL(x);` is a single statement
// and stays safe inside un-braced if/else bodies.
#define CUDA_CALL(f) do { \
  cudaError_t cuda_call_err_ = (f); \
  if (cuda_call_err_ != cudaSuccess) { \
    std::cout \
        << "    Error occurred: " << cuda_call_err_ \
        << " (" << cudaGetErrorString(cuda_call_err_) << ") at " \
        << __FILE__ << ":" << __LINE__ << std::endl; \
    std::exit(1); \
  } \
} while (0)

// Evaluates a cuDNN call `f`; if it does not return CUDNN_STATUS_SUCCESS,
// prints the numeric status, the readable status string and the call site,
// then exits the process with status 1.
// Wrapped in do { ... } while (0) so that `CUDNN_CALL(x);` is a single
// statement and stays safe inside un-braced if/else bodies.
#define CUDNN_CALL(f) do { \
  cudnnStatus_t cudnn_call_err_ = (f); \
  if (cudnn_call_err_ != CUDNN_STATUS_SUCCESS) { \
    std::cout \
        << "    Error occurred: " << cudnn_call_err_ \
        << " (" << cudnnGetErrorString(cudnn_call_err_) << ") at " \
        << __FILE__ << ":" << __LINE__ << std::endl; \
    std::exit(1); \
  } \
} while (0)

void print(const float* data, int n, int c, int h) {
    std::vector<float> buffer(1 << 20);
    CUDA_CALL(cudaMemcpy(, data,
        n * c * h * sizeof(float),
    int a = 0;
    for (int i = 0; i < n; ++i) {
        std::cout << "n=" << i << std::endl;
        for (int j = 0; j < c; ++j) {
            for (int k = 0; k < h; ++k) {

                std::cout << std::setw(4) << std::right << buffer[a];
                std::cout << std::endl;
    std::cout << std::endl;

// Allocates an array of `nb_elt` ints with malloc and sets every element to
// `fill_value`.
//
// Returns the new array; the caller owns it and must release it with free().
// Returns nullptr when nb_elt <= 0. Terminates the process if the allocation
// fails, matching the fail-fast behaviour of CUDA_CALL / CUDNN_CALL.
int* FillSameValue(int nb_elt, int fill_value)
{
    if (nb_elt <= 0)
        return nullptr;

    int* seqLengthArray = (int*)std::malloc(static_cast<size_t>(nb_elt) * sizeof(int));
    if (seqLengthArray == nullptr) {
        std::cerr << "    Error occurred: host allocation of " << nb_elt
                  << " ints failed" << std::endl;
        std::exit(1);
    }

    for (int i = 0; i < nb_elt; i++)
        seqLengthArray[i] = fill_value;
    return seqLengthArray;
}

int main()
    srand(time(NULL)); //Init random

    int N = 32,
        B = 1,
        H = 8,
        T = 10,
        Tk = 10,
        Cq = 15,
        Ck = 15,
        Cv = 15,
        Cq_ = 15,
        Cv_ = 15,
        Co_ = 15;


    cudnnHandle_t cudnn;

    //----------------------------------------------------------------Create Descriptors-----------------------------------------------------------------------------//

    // Creation of SeqDataDescriptor for Queries, Values, Keys and Residual
    cudnnSeqDataDescriptor_t QueriesSeqDataDesc;

    cudnnSeqDataDescriptor_t ValuesSeqDataDesc;

    cudnnSeqDataDescriptor_t KeysSeqDataDesc;

    cudnnSeqDataDescriptor_t OutputSeqDataDesc;

    cudnnSeqDataDescriptor_t ResidualSeqDataDesc = NULL; // no residual connections

    // Creation of the Attention Descriptor
    cudnnAttnDescriptor_t AttentionDesc;

    //Creation of the attn/post Dropout Descriptor
    cudnnDropoutDescriptor_t AttnDropoutDesc;

    cudnnDropoutDescriptor_t PostDropoutDesc;

    std::cout << "End Create Descriptors without Error" << std::endl;

    //----------------------------------------------------------------Data Malloc and MemCpy-------------------------------------------------------------------------//

    // Create and fill (cos(x)) an array that will be our source data for every cuda ptr
    int nb_inp_elt = N * T * Co_;
    size_t inp_size = nb_inp_elt * sizeof(float);
    float* data_src = (float*)malloc(inp_size);

    for (int i = 0; i < nb_inp_elt; i++)
        data_src[i] = (float)cos(i);

    // Malloc and Memcpy queries_data/values_data/keys_data
    float* queries_data;
    CUDA_CALL(cudaMalloc(&queries_data, inp_size));
    CUDA_CALL(cudaMemcpy(queries_data, data_src, inp_size, cudaMemcpyHostToDevice));


    float* values_data;
    CUDA_CALL(cudaMalloc(&values_data, inp_size));
    CUDA_CALL(cudaMemcpy(values_data, data_src, inp_size, cudaMemcpyHostToDevice));


    float* keys_data;
    CUDA_CALL(cudaMalloc(&keys_data, inp_size));
    CUDA_CALL(cudaMemcpy(keys_data, data_src, inp_size, cudaMemcpyHostToDevice));

    // Malloc and Memcpy out_data
    float* out_data;
    CUDA_CALL(cudaMalloc(&out_data, inp_size));
    //CUDA_CALL(cudaMemcpy(out_data, out_data_src, out_size, cudaMemcpyHostToDevice));

    std::cout << "End Data Malloc and MemCpy without Error" << std::endl;

    //-------------------------------------------------------------------Set Descriptors-----------------------------------------------------------------------------//

    // Set Dropout Descriptor
    // First Dropout States Size and Malloc it
    size_t states_size;
    CUDNN_CALL(cudnnDropoutGetStatesSize(cudnn, &states_size));

    // Attn States
    float* attn_states;
    CUDA_CALL(cudaMalloc(&attn_states, states_size));

    CUDNN_CALL(cudnnSetDropoutDescriptor(AttnDropoutDesc, // cudnnDropoutDescriptor_t
        cudnn, // cudnnHandle_t
        0.5, // dropout
        attn_states, //*states
        states_size, // stateSizeInBytes
        0)); //seed

    // Post States
    float* post_states;
    CUDA_CALL(cudaMalloc(&post_states, states_size));

    CUDNN_CALL(cudnnSetDropoutDescriptor(PostDropoutDesc, // cudnnDropoutDescriptor_t
        cudnn, // cudnnHandle_t
        0.5, // dropout
        post_states, //*states
        states_size, // stateSizeInBytes
        0)); //seed

    // Set Seq Data Descriptor (Queries, Values, Keys)

    auto make_seq_desc = [&](int N, int B, int T, int C, const int* seqLengthArray, size_t seqLengthArraySize) {
        cudnnSeqDataDescriptor_t desc;
        int dimA[CUDNN_SEQDATA_DIM_COUNT];
        cudnnSeqDataAxis_t axes[CUDNN_SEQDATA_DIM_COUNT] = {
        CUDNN_CALL(cudnnSetSeqDataDescriptor(desc, CUDNN_DATA_FLOAT, 4, dimA, axes, seqLengthArraySize, seqLengthArray, NULL));
        return desc;

    QueriesSeqDataDesc = make_seq_desc(N, B, T, Cq, FillSameValue(N * B, T), N*B);
    KeysSeqDataDesc = make_seq_desc(N, 1, Tk, Ck, FillSameValue(N, Tk), N);
    ValuesSeqDataDesc = make_seq_desc(N, 1, Tk, Cv, FillSameValue(N, Tk), N);
    OutputSeqDataDesc = make_seq_desc(N, B, T, Co_, FillSameValue(N*B, T), N*B);

    // Set Attention Descriptor

    CUDNN_CALL(cudnnSetAttnDescriptor(AttentionDesc, // cudnnAttnDescriptor_t
        H, // nHeads
        0.1, //smScaler
        CUDNN_DATA_FLOAT, //cudnnDataType_t
        CUDNN_DATA_FLOAT, //cudnnDataType_t
        CUDNN_DEFAULT_MATH, //cudnnMathType_t
        NULL, //cudnnDropoutDescriptor_t
        NULL, //cudnnDropoutDescriptor_t
        Cq, //qSize
        Ck, //kSize
        Cv, //vSize
        Cq_, //qProjSize
        Cq_, //kProSize
        Cv_, //vProjSize
        Co_, //oProjSize
        T, //qoMaxSeqLength
        Tk, //kvMaxSeqLength
        N, //maxBatchSize
        B)); //maxBeamSize

    std::cout << "End Set Descriptor without Error" << std::endl;

    //-----------------------------------------------------------Get Multi Buffer and Alloc--------------------------------------------------------------------------//

    // Get Multi Buffer 
    size_t weightspacesize;
    size_t workspacesize;
    size_t reservespacesize;

    CUDNN_CALL(cudnnGetMultiHeadAttnBuffers(cudnn, // cudnnHandle_t 
        AttentionDesc, // cudnnAttnDescriptor_t 
        &weightspacesize, // size_t*  
        &workspacesize, //size_t* 
        &reservespacesize)); //size_t*

    float* weight = NULL;
    float* workspace = NULL;
    float* reserve = NULL;

    float* data = (float*)malloc(weightspacesize);
    for (int i = 0; i < (int)weightspacesize/sizeof(float); i++)
        data[i] = rand();


    CUDA_CALL(cudaMalloc(&workspace, workspacesize));
    CUDA_CALL(cudaMalloc(&reserve, reservespacesize));
    CUDA_CALL(cudaMalloc(&weight, weightspacesize));


    //CUDA_CALL(cudaMemcpy(weight, data, weightspacesize, cudaMemcpyHostToDevice));

    std::cout << "End Get Multi Buffer without Error" << std::endl;

    //----------------------------------------------------------------------ATTN FORWARD------------------------------------------------------------------------------//

    int* devSeqLengthsQ0Host = FillSameValue(N, T);
    int* devSeqLengthsKVHost = FillSameValue(N, Tk);
    int* devSeqLengthsQ0Dev;
    int* devSeqLengthsKVDev;
    CUDA_CALL(cudaMalloc(&devSeqLengthsQ0Dev, N * B * sizeof(int)));
    CUDA_CALL(cudaMemcpy(devSeqLengthsQ0Dev, devSeqLengthsQ0Host, N * B * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMalloc(&devSeqLengthsKVDev, N * sizeof(int)));
    CUDA_CALL(cudaMemcpy(devSeqLengthsKVDev, devSeqLengthsKVHost, N * sizeof(int), cudaMemcpyHostToDevice));

    CUDNN_CALL(cudnnMultiHeadAttnForward(cudnn, // cudnnHandle_t
        AttentionDesc, //cudnnAttnDescriptor_t
        -1, //currIdx
        FillSameValue(T, 0), //loWinIdx[]
        FillSameValue(T, Tk), //hiWinIdx[]
        devSeqLengthsQ0Dev, //devSeqLengthsQO[]
        devSeqLengthsKVDev, //devSeqLengthsKV[]
        QueriesSeqDataDesc, //cudnnSeqDataDescriptor_t
        queries_data, //*queries
        NULL, //*residuals
        KeysSeqDataDesc, //cudnnSeqDataDescriptor_t
        keys_data, //*keys
        ValuesSeqDataDesc, //cudnnSeqDataDescriptor_t
        values_data, //*values
        OutputSeqDataDesc, //cudnnSeqDataDescriptor_t
        out_data, //*out
        weightspacesize, //size_t
        weight, //*weights
        workspacesize, //size_t
        workspace, //*workSpace
        reservespacesize, //size_t
        reserve)); //*reserveSpace
    std::cout << "multi_head_input" << std::endl;
    print(queries_data, 1, T, Co_); // Display queries_data [0,:,:]

    std::cout << "multi_head_output" << std::endl;

    print(out_data, 1, T, Co_); // Display out_data [0,:,:]

} (11.9 KB)

Cuda Version: 11.7
Graphics card: NVIDIA GeForce MX250
CuDNN Version: 8.4.1


Could you please share with us complete error logs for better debugging?

Thank you.


Thank you for your answer. However, I don’t get any error in particular; I suspect I am using the function incorrectly, since my result ends up being an array of zeros.
My question is: “Are my parameters incorrect? How could I modify my code (available in my first post) to get a correct result?”


Sorry for the delayed response.
Function usage looks correct at first glance. Could you please share the compilation command you’re using. We would like to try from our end for better debugging.

Thank you.


I’m sorry, but I am not very familiar with this topic.
Currently I use Visual Studio 2022 and create a CUDA project directly (the template is available in the VS environment), to which I add the cuDNN library (cudnn.lib) under “Linker → Input”.
I then add `#include <cudnn.h>` and that’s it.
So I can’t tell you the command line I use, since I let VS take care of that part for me.
However, if it is the line shown under “CUDA C/C++ → Command Line”, here it is:

(Approximate command-line. Settings inherited from host are not visible below.)
(Please see the output window after a build for the full command-line)

Driver API (NVCC Compilation Type is .cubin, .gpu, or .ptx)
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Windows Kits"
“C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin\nvcc.exe” --use-local-env -ccbin “C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.32. 31326\bin\HostX64\x64” -x cu -G --keep-dir x64\Debug -maxrregcount=0 --machine 32 --compile -cudart static -o C:\Users\coco\Desktop\MultiHeadMelangeMultiHeadMelangex64\Debug%(Filename)%(Extension).obj “%(FullPath)”

Runtime API (NVCC Compilation Type is hybrid object or .c file)
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Windows Kits\10"
“C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin\nvcc.exe” --use-local-env -ccbin “C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.32. 31326\bin\HostX64\x64” -x cu -G --keep-dir x64\Debug -maxrregcount=0 --machine 32 --compile -cudart static -g -Xcompiler "/EHsc /nologo /FS /Zi " -o C:\Users\coco\Desktop\MultiHeadMelange\x64\Debug%(Filename)%(Extension). obj “%(FullPath)”

I have sent you a copy of the VS project in .zip. (16.5 MB)