I’m trying to implement an LSTM with a recurrent projection (LSTMP) on cuDNN 7.1.4. According to the 7.1.1 release notes (https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_711.html#rel_711), LSTM with projection is supposed to work, at least in unidirectional mode for inference. Unfortunately, I always get error code 3 when calling cudnnRNNForwardInference. So my question is: does LSTMP actually work in cuDNN 7.1.4?
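For reference, status 3 in cudnnStatus_t is CUDNN_STATUS_BAD_PARAM, i.e. at least one argument to the failing call is invalid. A minimal sketch to confirm the mapping, assuming nothing beyond the cuDNN header:

#include <cudnn.h>
#include <stdio.h>

int main(void) {
    /* CUDNN_STATUS_BAD_PARAM is the cudnnStatus_t enumerator with value 3. */
    printf("%d -> %s\n", (int)CUDNN_STATUS_BAD_PARAM,
           cudnnGetErrorString(CUDNN_STATUS_BAD_PARAM));
    return 0;
}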
My code is modified from the official RNN sample as follows, with just one added line (#329 in my file, marked with a comment below).
/**
* Copyright 2016 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <cuda.h>
#include <cudnn.h>
#include <stdio.h>
#include <stdlib.h> // for atoi, malloc, free
// Reference outputs (calculated on an M40 GPU)
// > ./RNN 20 2 512 64 0
// Forward: 1299 GFLOPs
// Backward: 2171 GFLOPs, (1564 GFLOPs), (3549 GFLOPs)
// i checksum 1.315793E+06 h checksum 1.315212E+05
// di checksum 6.676003E+01 dh checksum 6.425067E+01
// dw checksum 1.453750E+09
//
// > ./RNN 20 2 512 64 1
// Forward: 1296 GFLOPs
// Backward: 2235 GFLOPs, (1567 GFLOPs), (3896 GFLOPs)
// i checksum 6.319591E+05 h checksum 6.319605E+04
// di checksum 4.501830E+00 dh checksum 4.489546E+00
// dw checksum 5.012598E+07
//
// > ./RNN 20 2 512 64 2
// Forward: 2635 GFLOPs
// Backward: 2757 GFLOPs, (2001 GFLOPs), (4433 GFLOPs)
// i checksum 5.749536E+05 c checksum 4.365091E+05 h checksum 5.774818E+04
// di checksum 3.842206E+02 dc checksum 9.323785E+03 dh checksum 1.182566E+01
// dw checksum 4.313461E+08
//
// > ./RNN 20 2 512 64 3
// Forward: 2428 GFLOPs
// Backward: 2645 GFLOPs, (1915 GFLOPs), (4270 GFLOPs)
// i checksum 6.358978E+05 h checksum 6.281680E+04
// di checksum 6.296622E+00 dh checksum 2.289960E+05
// dw checksum 5.397419E+07
// Define some error checking macros.
#define cudaErrCheck(stat) \
{ cudaErrCheck_((stat), __FILE__, __LINE__); }
void cudaErrCheck_(cudaError_t stat, const char *file, int line) {
if (stat != cudaSuccess) {
fprintf(stderr,
"CUDA Error: %s %s %d\n",
cudaGetErrorString(stat),
file,
line);
}
}
#define cudnnErrCheck(stat) \
{ cudnnErrCheck_((stat), __FILE__, __LINE__); }
void cudnnErrCheck_(cudnnStatus_t stat, const char *file, int line) {
if (stat != CUDNN_STATUS_SUCCESS) {
fprintf(stderr,
"cuDNN Error: %s %s %d\n",
cudnnGetErrorString(stat),
file,
line);
}
}
__global__ void initGPUData_ker(float *data, int numElements, float value) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < numElements) {
data[tid] = value;
}
}
void initGPUData(float *data, int numElements, float value) {
dim3 gridDim;
dim3 blockDim;
blockDim.x = 1024;
gridDim.x = (numElements + blockDim.x - 1) / blockDim.x;
initGPUData_ker<<<gridDim, blockDim>>>(data, numElements, value);
}
int main(int argc, char *argv[]) {
int seqLength;
int numLayers;
int hiddenSize;
int inputSize;
int miniBatch;
float dropout;
bool bidirectional;
int mode;
int persistent;
FILE *fp;
fp = fopen("result.txt", "w");
if (fp == NULL) {
fprintf(stderr, "Failed to open result.txt\n");
return 1;
}
if (argc == 6) {
seqLength = atoi(argv[1]);
numLayers = atoi(argv[2]);
hiddenSize = atoi(argv[3]);
inputSize = hiddenSize;
miniBatch = atoi(argv[4]);
dropout = 0;
bidirectional = 0;
mode = atoi(argv[5]);
persistent = 0;
} else {
printf("Usage:\n");
printf(
"./RNN <seqLength> <numLayers> <hiddenSize> <miniBatch> <mode>\n");
printf("Modes: 0 = RNN_RELU, 1 = RNN_TANH, 2 = LSTM, 3 = GRU\n");
return 1;
}
// -------------------------
// Create cudnn context
// -------------------------
cudnnHandle_t cudnnHandle;
cudnnErrCheck(cudnnCreate(&cudnnHandle));
// -------------------------
// Set up inputs and outputs
// -------------------------
void *x;
void *hx = NULL;
void *cx = NULL;
void *dx;
void *dhx = NULL;
void *dcx = NULL;
void *y;
void *hy = NULL;
void *cy = NULL;
void *dy;
void *dhy = NULL;
void *dcy = NULL;
// Memory allocation. hx, cx, dhx, dcx, hy, cy, dhy and dcy can be NULL.
cudaErrCheck(cudaMalloc((void **)&x,
seqLength * inputSize * miniBatch * sizeof(float)));
cudaErrCheck(cudaMalloc((void **)&hx,
numLayers * hiddenSize * miniBatch *
(bidirectional ? 2 : 1) * sizeof(float)));
cudaErrCheck(cudaMalloc((void **)&cx,
numLayers * hiddenSize * miniBatch *
(bidirectional ? 2 : 1) * sizeof(float)));
cudaErrCheck(cudaMalloc((void **)&dx,
seqLength * inputSize * miniBatch * sizeof(float)));
cudaErrCheck(cudaMalloc((void **)&dhx,
numLayers * hiddenSize * miniBatch *
(bidirectional ? 2 : 1) * sizeof(float)));
cudaErrCheck(cudaMalloc((void **)&dcx,
numLayers * hiddenSize * miniBatch *
(bidirectional ? 2 : 1) * sizeof(float)));
cudaErrCheck(cudaMalloc((void **)&y,
seqLength * hiddenSize * miniBatch *
(bidirectional ? 2 : 1) * sizeof(float)));
cudaErrCheck(cudaMalloc((void **)&hy,
numLayers * hiddenSize * miniBatch *
(bidirectional ? 2 : 1) * sizeof(float)));
cudaErrCheck(cudaMalloc((void **)&cy,
numLayers * hiddenSize * miniBatch *
(bidirectional ? 2 : 1) * sizeof(float)));
cudaErrCheck(cudaMalloc((void **)&dy,
seqLength * hiddenSize * miniBatch *
(bidirectional ? 2 : 1) * sizeof(float)));
cudaErrCheck(cudaMalloc((void **)&dhy,
numLayers * hiddenSize * miniBatch *
(bidirectional ? 2 : 1) * sizeof(float)));
cudaErrCheck(cudaMalloc((void **)&dcy,
numLayers * hiddenSize * miniBatch *
(bidirectional ? 2 : 1) * sizeof(float)));
// Set up tensor descriptors. x/y/dx/dy are arrays, one per time step.
cudnnTensorDescriptor_t *xDesc, *yDesc, *dxDesc, *dyDesc;
cudnnTensorDescriptor_t hxDesc, cxDesc;
cudnnTensorDescriptor_t hyDesc, cyDesc;
cudnnTensorDescriptor_t dhxDesc, dcxDesc;
cudnnTensorDescriptor_t dhyDesc, dcyDesc;
xDesc = (cudnnTensorDescriptor_t *)malloc(seqLength *
sizeof(cudnnTensorDescriptor_t));
yDesc = (cudnnTensorDescriptor_t *)malloc(seqLength *
sizeof(cudnnTensorDescriptor_t));
dxDesc = (cudnnTensorDescriptor_t *)malloc(seqLength *
sizeof(cudnnTensorDescriptor_t));
dyDesc = (cudnnTensorDescriptor_t *)malloc(seqLength *
sizeof(cudnnTensorDescriptor_t));
int dimA[3];
int strideA[3];
// In this example dimA[1] is constant across the whole sequence.
// This isn't required; all that is required is that it does not increase.
for (int i = 0; i < seqLength; i++) {
cudnnErrCheck(cudnnCreateTensorDescriptor(&xDesc[i]));
cudnnErrCheck(cudnnCreateTensorDescriptor(&yDesc[i]));
cudnnErrCheck(cudnnCreateTensorDescriptor(&dxDesc[i]));
cudnnErrCheck(cudnnCreateTensorDescriptor(&dyDesc[i]));
dimA[0] = miniBatch;
dimA[1] = inputSize;
dimA[2] = 1;
strideA[0] = dimA[2] * dimA[1];
strideA[1] = dimA[2];
strideA[2] = 1;
cudnnErrCheck(cudnnSetTensorNdDescriptor(
xDesc[i], CUDNN_DATA_FLOAT, 3, dimA, strideA));
cudnnErrCheck(cudnnSetTensorNdDescriptor(
dxDesc[i], CUDNN_DATA_FLOAT, 3, dimA, strideA));
dimA[0] = miniBatch;
dimA[1] = bidirectional ? hiddenSize * 2 : hiddenSize;
dimA[2] = 1;
strideA[0] = dimA[2] * dimA[1];
strideA[1] = dimA[2];
strideA[2] = 1;
cudnnErrCheck(cudnnSetTensorNdDescriptor(
yDesc[i], CUDNN_DATA_FLOAT, 3, dimA, strideA));
cudnnErrCheck(cudnnSetTensorNdDescriptor(
dyDesc[i], CUDNN_DATA_FLOAT, 3, dimA, strideA));
}
dimA[0] = numLayers * (bidirectional ? 2 : 1);
dimA[1] = miniBatch;
dimA[2] = hiddenSize;
strideA[0] = dimA[2] * dimA[1];
strideA[1] = dimA[2];
strideA[2] = 1;
cudnnErrCheck(cudnnCreateTensorDescriptor(&hxDesc));
cudnnErrCheck(cudnnCreateTensorDescriptor(&cxDesc));
cudnnErrCheck(cudnnCreateTensorDescriptor(&hyDesc));
cudnnErrCheck(cudnnCreateTensorDescriptor(&cyDesc));
cudnnErrCheck(cudnnCreateTensorDescriptor(&dhxDesc));
cudnnErrCheck(cudnnCreateTensorDescriptor(&dcxDesc));
cudnnErrCheck(cudnnCreateTensorDescriptor(&dhyDesc));
cudnnErrCheck(cudnnCreateTensorDescriptor(&dcyDesc));
cudnnErrCheck(
cudnnSetTensorNdDescriptor(hxDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
cudnnErrCheck(
cudnnSetTensorNdDescriptor(cxDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
cudnnErrCheck(
cudnnSetTensorNdDescriptor(hyDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
cudnnErrCheck(
cudnnSetTensorNdDescriptor(cyDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
cudnnErrCheck(cudnnSetTensorNdDescriptor(
dhxDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
cudnnErrCheck(cudnnSetTensorNdDescriptor(
dcxDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
cudnnErrCheck(cudnnSetTensorNdDescriptor(
dhyDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
cudnnErrCheck(cudnnSetTensorNdDescriptor(
dcyDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
// -------------------------
// Set up the dropout descriptor (needed for the RNN descriptor)
// -------------------------
unsigned long long seed = 1337ull; // Pick a seed.
cudnnDropoutDescriptor_t dropoutDesc;
cudnnErrCheck(cudnnCreateDropoutDescriptor(&dropoutDesc));
// How much memory does dropout need for states?
// These states are used to generate random numbers internally
// and should not be freed until the RNN descriptor is no longer used
size_t stateSize;
void *states;
cudnnErrCheck(cudnnDropoutGetStatesSize(cudnnHandle, &stateSize));
cudaErrCheck(cudaMalloc(&states, stateSize));
cudnnErrCheck(cudnnSetDropoutDescriptor(
dropoutDesc, cudnnHandle, dropout, states, stateSize, seed));
// -------------------------
// Set up the RNN descriptor
// -------------------------
cudnnRNNDescriptor_t rnnDesc;
cudnnRNNMode_t RNNMode;
cudnnRNNAlgo_t RNNAlgo;
cudnnErrCheck(cudnnCreateRNNDescriptor(&rnnDesc));
if (mode == 0)
RNNMode = CUDNN_RNN_RELU;
else if (mode == 1)
RNNMode = CUDNN_RNN_TANH;
else if (mode == 2)
RNNMode = CUDNN_LSTM;
else if (mode == 3)
RNNMode = CUDNN_GRU;
else {
printf("Invalid mode selected.\n");
return 1;
}
// Persistent RNNs are only supported on Pascal+ GPUs.
if (persistent == 0)
RNNAlgo = CUDNN_RNN_ALGO_STANDARD;
else if (persistent == 1)
RNNAlgo = CUDNN_RNN_ALGO_PERSIST_STATIC;
else if (persistent == 2)
RNNAlgo = CUDNN_RNN_ALGO_PERSIST_DYNAMIC;
cudnnErrCheck(cudnnSetRNNDescriptor_v6(
cudnnHandle,
rnnDesc,
hiddenSize,
numLayers,
dropoutDesc,
CUDNN_LINEAR_INPUT, // We can also skip the input matrix transformation
bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL,
RNNMode,
RNNAlgo, // Can be changed to use persistent RNNs on Pascal+ GPUs.
CUDNN_DATA_FLOAT));
cudnnErrCheck(cudnnSetRNNProjectionLayers(
cudnnHandle, rnnDesc, hiddenSize / 2, 0)); // <-- the one added line
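// NOTE (my assumption, from the cudnnSetRNNProjectionLayers docs): with a
// recurrent projection of recProjSize = hiddenSize / 2, the hidden-state
// vectors h (hx/hy, and y per direction) shrink to recProjSize elements,
// while the descriptors set up above still use hiddenSize everywhere.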
// -------------------------
// Set up parameters
// -------------------------
// This needs to be done after the RNN descriptor is set, as otherwise
// we don't know how many parameters we have to allocate.
void *w;
void *dw;
cudnnFilterDescriptor_t wDesc, dwDesc;
cudnnErrCheck(cudnnCreateFilterDescriptor(&wDesc));
cudnnErrCheck(cudnnCreateFilterDescriptor(&dwDesc));
size_t weightsSize;
cudnnErrCheck(cudnnGetRNNParamsSize(
cudnnHandle, rnnDesc, xDesc[0], &weightsSize, CUDNN_DATA_FLOAT));
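// (Since rnnDesc now carries the projection, the weightsSize returned here
// should already include the recurrent projection matrices.)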
int dimW[3];
dimW[0] = weightsSize / sizeof(float);
dimW[1] = 1;
dimW[2] = 1;
cudnnErrCheck(cudnnSetFilterNdDescriptor(
wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dimW));
cudnnErrCheck(cudnnSetFilterNdDescriptor(
dwDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dimW));
cudaErrCheck(cudaMalloc((void **)&w, weightsSize));
cudaErrCheck(cudaMalloc((void **)&dw, weightsSize));
// -------------------------
// Set up work space and reserved memory
// -------------------------
void *workspace;
void *reserveSpace;
size_t workSize;
size_t reserveSize;
// Needed for every pass.
cudnnErrCheck(cudnnGetRNNWorkspaceSize(
cudnnHandle, rnnDesc, seqLength, xDesc, &workSize));
// Only needed in training, shouldn't be touched between passes.
cudnnErrCheck(cudnnGetRNNTrainingReserveSize(
cudnnHandle, rnnDesc, seqLength, xDesc, &reserveSize));
cudaErrCheck(cudaMalloc((void **)&workspace, workSize));
cudaErrCheck(cudaMalloc((void **)&reserveSpace, reserveSize));
// *********************************************************************************************************
// Initialise weights and inputs
// *********************************************************************************************************
// We initialise to something simple.
// Matrices are initialised to 1 / matrixSize, biases to 1, data is 1.
initGPUData((float *)x, seqLength * inputSize * miniBatch, 1.f);
if (hx != NULL)
initGPUData((float *)hx,
numLayers * hiddenSize * miniBatch *
(bidirectional ? 2 : 1),
1.f);
if (cx != NULL)
initGPUData((float *)cx,
numLayers * hiddenSize * miniBatch *
(bidirectional ? 2 : 1),
1.f);
initGPUData((float *)dy,
seqLength * hiddenSize * miniBatch * (bidirectional ? 2 : 1),
1.f);
if (dhy != NULL)
initGPUData((float *)dhy,
numLayers * hiddenSize * miniBatch *
(bidirectional ? 2 : 1),
1.f);
if (dcy != NULL)
initGPUData((float *)dcy,
numLayers * hiddenSize * miniBatch *
(bidirectional ? 2 : 1),
1.f);
// Weights
int numLinearLayers = 0;
if (RNNMode == CUDNN_RNN_RELU || RNNMode == CUDNN_RNN_TANH) {
numLinearLayers = 2;
} else if (RNNMode == CUDNN_LSTM) {
numLinearLayers = 8;
} else if (RNNMode == CUDNN_GRU) {
numLinearLayers = 6;
}
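// NOTE (assumption, per the cudnnGetRNNLinLayerMatrixParams docs): when the
// recurrent projection is enabled, an extra linLayerID (8 for LSTM) refers
// to the projection matrix; the loop below only touches IDs 0..7, so the
// projection weights stay uninitialised here.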
for (int layer = 0; layer < numLayers * (bidirectional ? 2 : 1); layer++) {
for (int linLayerID = 0; linLayerID < numLinearLayers; linLayerID++) {
cudnnFilterDescriptor_t linLayerMatDesc;
cudnnErrCheck(cudnnCreateFilterDescriptor(&linLayerMatDesc));
float *linLayerMat;
cudnnErrCheck(
cudnnGetRNNLinLayerMatrixParams(cudnnHandle,
rnnDesc,
layer,
xDesc[0],
wDesc,
w,
linLayerID,
linLayerMatDesc,
(void **)&linLayerMat));
cudnnDataType_t dataType;
cudnnTensorFormat_t format;
int nbDims;
int filterDimA[3];
cudnnErrCheck(cudnnGetFilterNdDescriptor(
linLayerMatDesc, 3, &dataType, &format, &nbDims, filterDimA));
printf("layer=%d weights Dim[0]=%d, Dim[1]=%d, Dim[2]=%d\n",
layer,
filterDimA[0],
filterDimA[1],
filterDimA[2]);
initGPUData(
linLayerMat,
filterDimA[0] * filterDimA[1] * filterDimA[2],
1.f / (float)(filterDimA[0] * filterDimA[1] * filterDimA[2]));
cudnnErrCheck(cudnnDestroyFilterDescriptor(linLayerMatDesc));
cudnnFilterDescriptor_t linLayerBiasDesc;
cudnnErrCheck(cudnnCreateFilterDescriptor(&linLayerBiasDesc));
float *linLayerBias;
cudnnErrCheck(
cudnnGetRNNLinLayerBiasParams(cudnnHandle,
rnnDesc,
layer,
xDesc[0],
wDesc,
w,
linLayerID,
linLayerBiasDesc,
(void **)&linLayerBias));
cudnnErrCheck(cudnnGetFilterNdDescriptor(
linLayerBiasDesc, 3, &dataType, &format, &nbDims, filterDimA));
printf("layer=%d bias Dim[0]=%d, Dim[1]=%d, Dim[2]=%d\n",
layer,
filterDimA[0],
filterDimA[1],
filterDimA[2]);
initGPUData(linLayerBias,
filterDimA[0] * filterDimA[1] * filterDimA[2],
1.f);
cudnnErrCheck(cudnnDestroyFilterDescriptor(linLayerBiasDesc));
}
}
// *********************************************************************************************************
// Dynamic persistent RNN plan (if using this algo)
// *********************************************************************************************************
cudnnPersistentRNNPlan_t rnnPlan;
if (RNNAlgo == CUDNN_RNN_ALGO_PERSIST_DYNAMIC) {
// Note: This step is expensive. Once completed, the plan can be reused
// so long as the descriptor, minibatch size and datatype don't change.
cudnnErrCheck(cudnnCreatePersistentRNNPlan(
rnnDesc, miniBatch, CUDNN_DATA_FLOAT, &rnnPlan));
// Tell calls using this descriptor which plan to use.
cudnnErrCheck(cudnnSetPersistentRNNPlan(rnnDesc, rnnPlan));
}
// *********************************************************************************************************
// At this point all of the setup is done. We now need to pass through the
// RNN.
// *********************************************************************************************************
cudaErrCheck(cudaDeviceSynchronize());
cudaEvent_t start, stop;
float timeForward, timeBackward1, timeBackward2;
cudaErrCheck(cudaEventCreate(&start));
cudaErrCheck(cudaEventCreate(&stop));
cudaErrCheck(cudaEventRecord(start));
// If we're not training, we use this instead.
cudnnErrCheck(cudnnRNNForwardInference(cudnnHandle,
rnnDesc,
seqLength,
xDesc,
x,
hxDesc,
hx,
cxDesc,
cx,
wDesc,
w,
yDesc,
y,
hyDesc,
hy,
cyDesc,
cy,
workspace,
workSize));
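// ^ This is the call that returns status 3 (CUDNN_STATUS_BAD_PARAM)
// once the projection line above is enabled.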
// cudnnErrCheck(cudnnRNNForwardTraining(cudnnHandle,
// rnnDesc,
// seqLength,
// xDesc,
// x,
// hxDesc,
// hx,
// cxDesc,
// cx,
// wDesc,
// w,
// yDesc,
// y,
// hyDesc,
// hy,
// cyDesc,
// cy,
// workspace,
// workSize,
// reserveSpace,
// reserveSize));
cudaErrCheck(cudaEventRecord(stop));
cudaErrCheck(cudaEventSynchronize(stop));
cudaErrCheck(cudaEventElapsedTime(&timeForward, start, stop));
cudaErrCheck(cudaEventRecord(start));
// cudnnErrCheck(cudnnRNNBackwardData(cudnnHandle,
// rnnDesc,
// seqLength,
// yDesc,
// y,
// dyDesc,
// dy,
// dhyDesc,
// dhy,
// dcyDesc,
// dcy,
// wDesc,
// w,
// hxDesc,
// hx,
// cxDesc,
// cx,
// dxDesc,
// dx,
// dhxDesc,
// dhx,
// dcxDesc,
// dcx,
// workspace,
// workSize,
// reserveSpace,
// reserveSize));
cudaErrCheck(cudaEventRecord(stop));
cudaErrCheck(cudaEventSynchronize(stop));
cudaErrCheck(cudaEventElapsedTime(&timeBackward1, start, stop));
cudaErrCheck(cudaEventRecord(start));
// cudnnRNNBackwardWeights adds to the data in dw.
// cudaErrCheck(cudaMemset(dw, 0, weightsSize));
// cudnnErrCheck(cudnnRNNBackwardWeights(cudnnHandle,
// rnnDesc,
// seqLength,
// xDesc,
// x,
// hxDesc,
// hx,
// yDesc,
// y,
// workspace,
// workSize,
// dwDesc,
// dw,
// reserveSpace,
// reserveSize));
cudaErrCheck(cudaEventRecord(stop));
cudaErrCheck(cudaEventSynchronize(stop));
cudaErrCheck(cudaEventElapsedTime(&timeBackward2, start, stop));
int numMats = 0;
if (RNNMode == CUDNN_RNN_RELU || RNNMode == CUDNN_RNN_TANH) {
numMats = 2;
} else if (RNNMode == CUDNN_LSTM) {
numMats = 8;
} else if (RNNMode == CUDNN_GRU) {
numMats = 6;
}
// Calculate FLOPS
printf("Forward: %3.0f GFLOPS\n",
numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize *
seqLength * miniBatch * numLayers / (1e6 * timeForward));
printf("Backward: %3.0f GFLOPS, ",
numMats * 4ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize *
seqLength * miniBatch * numLayers /
(1e6 * (timeBackward1 + timeBackward2)));
printf("(%3.0f GFLOPS), ",
numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize *
seqLength * miniBatch * numLayers / (1e6 * timeBackward1));
printf("(%3.0f GFLOPS)\n",
numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize *
seqLength * miniBatch * numLayers / (1e6 * timeBackward2));
// Calculate FLOPS
fprintf(fp,
"Forward: %3.0f GFLOPS\n",
numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize *
seqLength * miniBatch * numLayers / (1e6 * timeForward));
fprintf(fp,
"Backward: %3.0f GFLOPS, ",
numMats * 4ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize *
seqLength * miniBatch * numLayers /
(1e6 * (timeBackward1 + timeBackward2)));
fprintf(fp,
"(%3.0f GFLOPS), ",
numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize *
seqLength * miniBatch * numLayers / (1e6 * timeBackward1));
fprintf(fp,
"(%3.0f GFLOPS)\n",
numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize *
seqLength * miniBatch * numLayers / (1e6 * timeBackward2));
// Make double-sure everything is finished before we copy for result
// checking.
cudaDeviceSynchronize();
// *********************************************************************************************************
// Print checksums.
// *********************************************************************************************************
if (true) {
float *testOutputi;
float *testOutputh;
float *testOutputc;
int biDirScale = (bidirectional ? 2 : 1);
testOutputi = (float *)malloc(hiddenSize * seqLength * miniBatch *
biDirScale * sizeof(float));
testOutputh = (float *)malloc(hiddenSize * miniBatch * numLayers *
biDirScale * sizeof(float));
testOutputc = (float *)malloc(hiddenSize * miniBatch * numLayers *
biDirScale * sizeof(float));
cudaErrCheck(cudaMemcpy(testOutputi,
y,
hiddenSize * seqLength * miniBatch *
biDirScale * sizeof(float),
cudaMemcpyDeviceToHost));
if (hy != NULL)
cudaErrCheck(cudaMemcpy(testOutputh,
hy,
numLayers * hiddenSize * miniBatch *
biDirScale * sizeof(float),
cudaMemcpyDeviceToHost));
if (cy != NULL && RNNMode == CUDNN_LSTM)
cudaErrCheck(cudaMemcpy(testOutputc,
cy,
numLayers * hiddenSize * miniBatch *
biDirScale * sizeof(float),
cudaMemcpyDeviceToHost));
double checksumi = 0.f;
double checksumh = 0.f;
double checksumc = 0.f;
for (int m = 0; m < miniBatch; m++) {
double localSumi = 0;
double localSumh = 0;
double localSumc = 0;
for (int j = 0; j < seqLength; j++) {
for (int i = 0; i < hiddenSize * biDirScale; i++) {
localSumi +=
testOutputi[j * miniBatch * hiddenSize * biDirScale +
m * hiddenSize * biDirScale + i];
}
}
for (int j = 0; j < numLayers * biDirScale; j++) {
for (int i = 0; i < hiddenSize; i++) {
if (hy != NULL)
localSumh += testOutputh[j * hiddenSize * miniBatch +
m * hiddenSize + i];
if (cy != NULL)
if (RNNMode == CUDNN_LSTM)
localSumc +=
testOutputc[j * hiddenSize * miniBatch +
m * hiddenSize + i];
}
}
checksumi += localSumi;
checksumh += localSumh;
checksumc += localSumc;
}
printf("i checksum %E ", checksumi);
fprintf(fp, "i checksum %E ", checksumi);
if (RNNMode == CUDNN_LSTM) {
printf("c checksum %E ", checksumc);
fprintf(fp, "c checksum %E ", checksumc);
}
printf("h checksum %E\n", checksumh);
fprintf(fp, "h checksum %E\n", checksumh);
free(testOutputi);
free(testOutputc);
free(testOutputh);
}
if (true) {
float *testOutputdi;
float *testOutputdh;
float *testOutputdc;
int biDirScale = (bidirectional ? 2 : 1);
testOutputdi =
(float *)malloc(inputSize * seqLength * miniBatch * sizeof(float));
testOutputdh = (float *)malloc(hiddenSize * miniBatch * numLayers *
biDirScale * sizeof(float));
testOutputdc = (float *)malloc(hiddenSize * miniBatch * numLayers *
biDirScale * sizeof(float));
cudaErrCheck(
cudaMemcpy(testOutputdi,
dx,
seqLength * miniBatch * inputSize * sizeof(float),
cudaMemcpyDeviceToHost));
if (dhx != NULL)
cudaErrCheck(cudaMemcpy(testOutputdh,
dhx,
numLayers * hiddenSize * miniBatch *
biDirScale * sizeof(float),
cudaMemcpyDeviceToHost));
if (dcx != NULL)
if (RNNMode == CUDNN_LSTM)
cudaErrCheck(cudaMemcpy(testOutputdc,
dcx,
numLayers * hiddenSize * miniBatch *
biDirScale * sizeof(float),
cudaMemcpyDeviceToHost));
double checksumdi = 0.;
double checksumdh = 0.;
double checksumdc = 0.;
for (int m = 0; m < miniBatch; m++) {
double localSumdi = 0;
double localSumdh = 0;
double localSumdc = 0;
for (int j = 0; j < seqLength; j++) {
for (int i = 0; i < inputSize; i++) {
localSumdi += testOutputdi[j * miniBatch * inputSize +
m * inputSize + i];
}
}
for (int j = 0; j < numLayers * biDirScale; j++) {
for (int i = 0; i < hiddenSize; i++) {
localSumdh += testOutputdh[j * hiddenSize * miniBatch +
m * hiddenSize + i];
if (RNNMode == CUDNN_LSTM)
localSumdc += testOutputdc[j * hiddenSize * miniBatch +
m * hiddenSize + i];
}
}
checksumdi += localSumdi;
checksumdh += localSumdh;
checksumdc += localSumdc;
}
printf("di checksum %E ", checksumdi);
fprintf(fp, "di checksum %E ", checksumdi);
if (RNNMode == CUDNN_LSTM) {
printf("dc checksum %E ", checksumdc);
fprintf(fp, "dc checksum %E ", checksumdc);
}
printf("dh checksum %E\n", checksumdh);
fprintf(fp, "dh checksum %E\n", checksumdh);
free(testOutputdi);
free(testOutputdh);
free(testOutputdc);
}
if (true) {
float *testOutputdw;
testOutputdw = (float *)malloc(weightsSize);
cudaErrCheck(
cudaMemcpy(testOutputdw, dw, weightsSize, cudaMemcpyDeviceToHost));
double checksumdw = 0.;
for (int i = 0; i < weightsSize / sizeof(float); i++) {
checksumdw += testOutputdw[i];
}
printf("dw checksum %E\n", checksumdw);
fprintf(fp, "dw checksum %E\n", checksumdw);
free(testOutputdw);
}
if (RNNAlgo == CUDNN_RNN_ALGO_PERSIST_DYNAMIC) {
cudnnDestroyPersistentRNNPlan(rnnPlan);
}
cudaFree(x);
cudaFree(hx);
cudaFree(cx);
cudaFree(y);
cudaFree(hy);
cudaFree(cy);
cudaFree(dx);
cudaFree(dhx);
cudaFree(dcx);
cudaFree(dy);
cudaFree(dhy);
cudaFree(dcy);
cudaFree(workspace);
cudaFree(reserveSpace);
cudaFree(w);
cudaFree(dw);
cudaFree(states);
for (int i = 0; i < seqLength; i++) {
cudnnDestroyTensorDescriptor(xDesc[i]);
cudnnDestroyTensorDescriptor(yDesc[i]);
cudnnDestroyTensorDescriptor(dxDesc[i]);
cudnnDestroyTensorDescriptor(dyDesc[i]);
}
cudnnDestroyTensorDescriptor(hxDesc);
cudnnDestroyTensorDescriptor(cxDesc);
cudnnDestroyTensorDescriptor(hyDesc);
cudnnDestroyTensorDescriptor(cyDesc);
cudnnDestroyTensorDescriptor(dhxDesc);
cudnnDestroyTensorDescriptor(dcxDesc);
cudnnDestroyTensorDescriptor(dhyDesc);
cudnnDestroyTensorDescriptor(dcyDesc);
cudnnDestroyDropoutDescriptor(dropoutDesc);
cudnnDestroyRNNDescriptor(rnnDesc);
cudnnDestroyFilterDescriptor(wDesc);
cudnnDestroyFilterDescriptor(dwDesc);
free(xDesc);
free(yDesc);
free(dxDesc);
free(dyDesc);
cudnnDestroy(cudnnHandle);
fclose(fp);
return 0;
}