Hi,
I’m trying to use the “MultiHeadAttentionForward” function, I had some difficulties with “BAD_PARAM” however this is now a thing of the past (at least I hope so).
My current problem comes from the result that the forward returns, indeed this function returns a complete array of 0, which I think is not the right result.
Is it my use and the parameters I pass that are incorrect?
Here is my code:
#include <iomanip>
#include <iostream>
#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>
#include <cudnn.h>
#define CUDA_CALL(f) { \
cudaError_t err = (f); \
if (err != cudaSuccess) { \
std::cout \
<< " Error occurred: " << err << std::endl; \
std::exit(1); \
} \
}
#define CUDNN_CALL(f) { \
cudnnStatus_t err = (f); \
if (err != CUDNN_STATUS_SUCCESS) { \
std::cout \
<< " Error occurred: " << err << std::endl; \
std::exit(1); \
} \
}
void print(const float* data, int n, int c, int h) {
std::vector<float> buffer(1 << 20);
CUDA_CALL(cudaMemcpy(
buffer.data(), data,
n * c * h * sizeof(float),
cudaMemcpyDeviceToHost));
int a = 0;
for (int i = 0; i < n; ++i) {
std::cout << "n=" << i << std::endl;
for (int j = 0; j < c; ++j) {
for (int k = 0; k < h; ++k) {
std::cout << std::setw(4) << std::right << buffer[a];
++a;
}
std::cout << std::endl;
}
}
std::cout << std::endl;
}
int* FillSameValue(int nb_elt, int fill_value)
{
int* seqLengthArray = (int*)malloc(nb_elt * sizeof(int));
for (int i = 0; i < nb_elt; i++)
{
seqLengthArray[i] = fill_value;
}
return seqLengthArray;
}
int main()
{
srand(time(NULL)); //Init random
int N = 32,
B = 1,
H = 8,
T = 10,
Tk = 10,
Cq = 15,
Ck = 15,
Cv = 15,
Cq_ = 15,
Cv_ = 15,
Co_ = 15;
CUDA_CALL(cudaSetDevice(0));
cudnnHandle_t cudnn;
CUDNN_CALL(cudnnCreate(&cudnn));
//---------------------------------------------------------------------------------------------------------------------------------------------------------------//
//----------------------------------------------------------------Create Descriptors-----------------------------------------------------------------------------//
//---------------------------------------------------------------------------------------------------------------------------------------------------------------//
// Creation of SeqDataDescriptor for Queries, Values, Keys and Residual
cudnnSeqDataDescriptor_t QueriesSeqDataDesc;
CUDNN_CALL(cudnnCreateSeqDataDescriptor(&QueriesSeqDataDesc));
cudnnSeqDataDescriptor_t ValuesSeqDataDesc;
CUDNN_CALL(cudnnCreateSeqDataDescriptor(&ValuesSeqDataDesc));
cudnnSeqDataDescriptor_t KeysSeqDataDesc;
CUDNN_CALL(cudnnCreateSeqDataDescriptor(&KeysSeqDataDesc));
cudnnSeqDataDescriptor_t OutputSeqDataDesc;
CUDNN_CALL(cudnnCreateSeqDataDescriptor(&OutputSeqDataDesc));
cudnnSeqDataDescriptor_t ResidualSeqDataDesc = NULL; // no residual connections
// Creation of the Attention Descriptor
cudnnAttnDescriptor_t AttentionDesc;
CUDNN_CALL(cudnnCreateAttnDescriptor(&AttentionDesc));
//Creation of the attn/post Dropout Descriptor
cudnnDropoutDescriptor_t AttnDropoutDesc;
CUDNN_CALL(cudnnCreateDropoutDescriptor(&AttnDropoutDesc));
cudnnDropoutDescriptor_t PostDropoutDesc;
CUDNN_CALL(cudnnCreateDropoutDescriptor(&PostDropoutDesc));
std::cout << "End Create Descriptors without Error" << std::endl;
//---------------------------------------------------------------------------------------------------------------------------------------------------------------//
//----------------------------------------------------------------Data Malloc and MemCpy-------------------------------------------------------------------------//
//---------------------------------------------------------------------------------------------------------------------------------------------------------------//
// Create and fill (cos(x)) an array that will be our source data for every cuda ptr
int nb_inp_elt = N * T * Co_;
size_t inp_size = nb_inp_elt * sizeof(float);
float* data_src = (float*)malloc(inp_size);
for (int i = 0; i < nb_inp_elt; i++)
{
data_src[i] = (float)cos(i);
}
// Malloc and Memcpy queries_data/values_data/keys_data
float* queries_data;
CUDA_CALL(cudaMalloc(&queries_data, inp_size));
CUDA_CALL(cudaMemcpy(queries_data, data_src, inp_size, cudaMemcpyHostToDevice));
float* values_data;
CUDA_CALL(cudaMalloc(&values_data, inp_size));
CUDA_CALL(cudaMemcpy(values_data, data_src, inp_size, cudaMemcpyHostToDevice));
float* keys_data;
CUDA_CALL(cudaMalloc(&keys_data, inp_size));
CUDA_CALL(cudaMemcpy(keys_data, data_src, inp_size, cudaMemcpyHostToDevice));
// Malloc and Memcpy out_data
float* out_data;
CUDA_CALL(cudaMalloc(&out_data, inp_size));
//CUDA_CALL(cudaMemcpy(out_data, out_data_src, out_size, cudaMemcpyHostToDevice));
std::cout << "End Data Malloc and MemCpy without Error" << std::endl;
//---------------------------------------------------------------------------------------------------------------------------------------------------------------//
//-------------------------------------------------------------------Set Descriptors-----------------------------------------------------------------------------//
//---------------------------------------------------------------------------------------------------------------------------------------------------------------//
// Set Dropout Descriptor
// First Dropout States Size and Malloc it
size_t states_size;
CUDNN_CALL(cudnnDropoutGetStatesSize(cudnn, &states_size));
// Attn States
float* attn_states;
CUDA_CALL(cudaMalloc(&attn_states, states_size));
CUDNN_CALL(cudnnSetDropoutDescriptor(AttnDropoutDesc, // cudnnDropoutDescriptor_t
cudnn, // cudnnHandle_t
0.5, // dropout
attn_states, //*states
states_size, // stateSizeInBytes
0)); //seed
// Post States
float* post_states;
CUDA_CALL(cudaMalloc(&post_states, states_size));
CUDNN_CALL(cudnnSetDropoutDescriptor(PostDropoutDesc, // cudnnDropoutDescriptor_t
cudnn, // cudnnHandle_t
0.5, // dropout
post_states, //*states
states_size, // stateSizeInBytes
0)); //seed
// Set Seq Data Descriptor (Queries, Values, Keys)
auto make_seq_desc = [&](int N, int B, int T, int C, const int* seqLengthArray, size_t seqLengthArraySize) {
cudnnSeqDataDescriptor_t desc;
CUDNN_CALL(cudnnCreateSeqDataDescriptor(&desc));
int dimA[CUDNN_SEQDATA_DIM_COUNT];
dimA[CUDNN_SEQDATA_BATCH_DIM] = N;
dimA[CUDNN_SEQDATA_BEAM_DIM] = B;
dimA[CUDNN_SEQDATA_TIME_DIM] = T;
dimA[CUDNN_SEQDATA_VECT_DIM] = C;
cudnnSeqDataAxis_t axes[CUDNN_SEQDATA_DIM_COUNT] = {
CUDNN_SEQDATA_BATCH_DIM,
CUDNN_SEQDATA_BEAM_DIM,
CUDNN_SEQDATA_TIME_DIM,
CUDNN_SEQDATA_VECT_DIM
};
CUDNN_CALL(cudnnSetSeqDataDescriptor(desc, CUDNN_DATA_FLOAT, 4, dimA, axes, seqLengthArraySize, seqLengthArray, NULL));
return desc;
};
QueriesSeqDataDesc = make_seq_desc(N, B, T, Cq, FillSameValue(N * B, T), N*B);
KeysSeqDataDesc = make_seq_desc(N, 1, Tk, Ck, FillSameValue(N, Tk), N);
ValuesSeqDataDesc = make_seq_desc(N, 1, Tk, Cv, FillSameValue(N, Tk), N);
OutputSeqDataDesc = make_seq_desc(N, B, T, Co_, FillSameValue(N*B, T), N*B);
// Set Attention Descriptor
CUDNN_CALL(cudnnSetAttnDescriptor(AttentionDesc, // cudnnAttnDescriptor_t
CUDNN_ATTN_ENABLE_PROJ_BIASES, // attnMode CUDNN_ATTN_QUERYMAP_ALL_TO_ONE/CUDNN_ATTN_QUERYMAP_ONE_TO_ONE & CUDNN_ATTN_DISABLE_PROJ_BIASES/CUDNN_ATTN_ENABLE_PROJ_BIASES
H, // nHeads
0.1, //smScaler
CUDNN_DATA_FLOAT, //cudnnDataType_t
CUDNN_DATA_FLOAT, //cudnnDataType_t
CUDNN_DEFAULT_MATH, //cudnnMathType_t
NULL, //cudnnDropoutDescriptor_t
NULL, //cudnnDropoutDescriptor_t
Cq, //qSize
Ck, //kSize
Cv, //vSize
Cq_, //qProjSize
Cq_, //kProSize
Cv_, //vProjSize
Co_, //oProjSize
T, //qoMaxSeqLength
Tk, //kvMaxSeqLength
N, //maxBatchSize
B)); //maxBeamSize
std::cout << "End Set Descriptor without Error" << std::endl;
//---------------------------------------------------------------------------------------------------------------------------------------------------------------//
//-----------------------------------------------------------Get Multi Buffer and Alloc--------------------------------------------------------------------------//
//---------------------------------------------------------------------------------------------------------------------------------------------------------------//
// Get Multi Buffer
size_t weightspacesize;
size_t workspacesize;
size_t reservespacesize;
CUDNN_CALL(cudnnGetMultiHeadAttnBuffers(cudnn, // cudnnHandle_t
AttentionDesc, // cudnnAttnDescriptor_t
&weightspacesize, // size_t*
&workspacesize, //size_t*
&reservespacesize)); //size_t*
float* weight = NULL;
float* workspace = NULL;
float* reserve = NULL;
float* data = (float*)malloc(weightspacesize);
for (int i = 0; i < (int)weightspacesize/sizeof(float); i++)
{
data[i] = rand();
}
CUDA_CALL(cudaMalloc(&workspace, workspacesize));
CUDA_CALL(cudaMalloc(&reserve, reservespacesize));
CUDA_CALL(cudaMalloc(&weight, weightspacesize));
//CUDA_CALL(cudaMemcpy(weight, data, weightspacesize, cudaMemcpyHostToDevice));
std::cout << "End Get Multi Buffer without Error" << std::endl;
//---------------------------------------------------------------------------------------------------------------------------------------------------------------//
//----------------------------------------------------------------------ATTN FORWARD------------------------------------------------------------------------------//
//---------------------------------------------------------------------------------------------------------------------------------------------------------------//
int* devSeqLengthsQ0Host = FillSameValue(N, T);
int* devSeqLengthsKVHost = FillSameValue(N, Tk);
int* devSeqLengthsQ0Dev;
int* devSeqLengthsKVDev;
CUDA_CALL(cudaMalloc(&devSeqLengthsQ0Dev, N * B * sizeof(int)));
CUDA_CALL(cudaMemcpy(devSeqLengthsQ0Dev, devSeqLengthsQ0Host, N * B * sizeof(int), cudaMemcpyHostToDevice));
CUDA_CALL(cudaMalloc(&devSeqLengthsKVDev, N * sizeof(int)));
CUDA_CALL(cudaMemcpy(devSeqLengthsKVDev, devSeqLengthsKVHost, N * sizeof(int), cudaMemcpyHostToDevice));
CUDNN_CALL(cudnnMultiHeadAttnForward(cudnn, // cudnnHandle_t
AttentionDesc, //cudnnAttnDescriptor_t
-1, //currIdx
FillSameValue(T, 0), //loWinIdx[]
FillSameValue(T, Tk), //hiWinIdx[]
devSeqLengthsQ0Dev, //devSeqLengthsQO[]
devSeqLengthsKVDev, //devSeqLengthsKV[]
QueriesSeqDataDesc, //cudnnSeqDataDescriptor_t
queries_data, //*queries
NULL, //*residuals
KeysSeqDataDesc, //cudnnSeqDataDescriptor_t
keys_data, //*keys
ValuesSeqDataDesc, //cudnnSeqDataDescriptor_t
values_data, //*values
OutputSeqDataDesc, //cudnnSeqDataDescriptor_t
out_data, //*out
weightspacesize, //size_t
weight, //*weights
workspacesize, //size_t
workspace, //*workSpace
reservespacesize, //size_t
reserve)); //*reserveSpace
std::cout << "multi_head_input" << std::endl;
print(queries_data, 1, T, Co_); // Display queries_data [0,:,:]
std::cout << "multi_head_output" << std::endl;
print(out_data, 1, T, Co_); // Display out_data [0,:,:]
}
File.cu (11.9 KB)
Cuda Version: 11.7
Graphics card: NVIDIA GeForce MX250
CuDNN Version: 8.4.1