What does Error: CUDNN_STATUS_BAD_PARAM; Reason: 0 != wDimA[0] % g mean?

Hello,
I am using cudnn-frontend to piece together custom internal networks using C++.
I have successfully chained single one-channel convolutions, and multiple channel inputs. I have not been successful when output channels to any graph is not 1. When trying various combinations of dimensions and strides for image, filter, and output, sometimes I get error: Error: CUDNN_STATUS_BAD_PARAM; Reason: 0 != wDimA[0] % g. Any hints on what that means?
Thanks

Hi @westwood ,
can you help us with the reproducible snippet?

Thanks

I will work on a more exacting snippet, but it is essentially the first TEST_CASE in cudnn-frontend/samples/cpp/convolution/fprop.cpp with n = 1, c = 1, h = 64, w = 64, r = s = 3, and k = 2. I find this error in the loop over image and filter dimensions and strides. Worse though, I never find a correct output of the convolution. I have also looped over setting dimensions and strides for the y_tensor, no luck. I can however find correct results when k = 1 and c != 1, and various h = w, r = s sizes.

/*

  • Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  • Permission is hereby granted, free of charge, to any person obtaining a
  • copy of this software and associated documentation files (the “Software”),
  • to deal in the Software without restriction, including without limitation
  • the rights to use, copy, modify, merge, publish, distribute, sublicense,
  • and/or sell copies of the Software, and to permit persons to whom the
  • Software is furnished to do so, subject to the following conditions:
  • The above copyright notice and this permission notice shall be included in
  • all copies or substantial portions of the Software.
  • THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  • IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  • FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  • THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  • LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  • FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  • DEALINGS IN THE SOFTWARE.
    */
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored “-Wsuggest-override”
    #pragma GCC diagnostic ignored “-Wunused-function”
    include <cudnn_frontend.h>
    include <helpers.h>
    #pragma GCC diagnostic pop

include
include

////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////
std::vector <int64_t> formDim (const std::vector <int64_t> dims, unsigned dimForm)
{
auto n = dims [0],
c = dims [1],
h = dims [2],
w = dims [3];
switch (dimForm)
{
// nk c hr ws
case 0: {return std::vector <int64_t> ({n, c, h, w}); break;} // idim, kdim main
case 1: {return std::vector <int64_t> ({n, c, w, h}); break;}
case 2: {return std::vector <int64_t> ({n, h, c, w}); break;}
case 3: {return std::vector <int64_t> ({n, h, w, c}); break;}
case 4: {return std::vector <int64_t> ({n, w, h, c}); break;}
case 5: {return std::vector <int64_t> ({n, w, c, h}); break;}

  case 6: {return std::vector  <int64_t>  ({c, n, h, w}); break;}
  case 7: {return std::vector  <int64_t>  ({c, n, w, h}); break;}
  case 8: {return std::vector  <int64_t>  ({c, h, n, w}); break;}
  case 9: {return std::vector  <int64_t>  ({c, h, w, n}); break;}
  case 10: {return std::vector <int64_t>  ({c, w, h, n}); break;}
  case 11: {return std::vector <int64_t>  ({c, w, n, h}); break;}

  case 12: {return std::vector <int64_t>  ({h, c, n, w}); break;}
  case 13: {return std::vector <int64_t>  ({h, c, w, n}); break;}
  case 14: {return std::vector <int64_t>  ({h, w, c, n}); break;}
  case 15: {return std::vector <int64_t>  ({h, w, n, c}); break;}
  case 16: {return std::vector <int64_t>  ({h, n, c, w}); break;}
  case 17: {return std::vector <int64_t>  ({h, n, w, c}); break;}

  case 18: {return std::vector <int64_t>  ({w, c, n, h}); break;}
  case 19: {return std::vector <int64_t>  ({w, c, h, n}); break;}
  case 20: {return std::vector <int64_t>  ({w, h, c, n}); break;}
  case 21: {return std::vector <int64_t>  ({w, h, n, c}); break;}
  case 22: {return std::vector <int64_t>  ({w, n, h, c}); break;}
  case 23: {return std::vector <int64_t>  ({w, n, c, h}); break;}
  default:
  {
     std::cout << "WARNING hit default dim at kind = " << dimForm << std::endl;
     return std::vector <int64_t> ({1, 1, 1, 1});
  }

}
}

////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////
std::vector <int64_t> formStride (const std::vector <int64_t>& dims, unsigned kind)
{
// for kernel k c r s, must return 1, 1, s * c, c
int64_t k = dims [0],
c = dims [1],
r = dims [2],
s = dims [3];
switch (kind)
{ // k (n) c r (h) s (w)
case 0: {return std::vector <int64_t> ({1, k, k * c, k * c * r}); break;}
case 1: {return std::vector <int64_t> ({1, k, k * c * s, k * c }); break;} // main kstride
case 2: {return std::vector <int64_t> ({1, k * r, k, k * c * r}); break;}
case 3: {return std::vector <int64_t> ({1, k * r * s, k, k * r }); break;}
case 4: {return std::vector <int64_t> ({1, k * s, k * c * s, k }); break;}
case 5: {return std::vector <int64_t> ({1, k * r * s, k * s, k }); break;}

  case 6: {return std::vector <int64_t>  ({c,           1,          k * c,     k * c * r}); break;}
  case 7: {return std::vector <int64_t>  ({c,           1,          k * c * s, k * c    }); break;}
  case 9: {return std::vector <int64_t>  ({c * r,       1,          c,         k * c * r}); break;}
  case 8: {return std::vector <int64_t>  ({c * r * s,   1,          c,         c * r    }); break;}
  case 10: {return std::vector <int64_t> ({c * s,       1,          k * c * s, c,       }); break;}
  case 11: {return std::vector <int64_t> ({c * r * s,   1,          c * s,     c,       }); break;}

  case 12: {return std::vector <int64_t> ({r,           r * k,     1,          r * k * c}); break;}
  case 13: {return std::vector <int64_t> ({r,           r * k * s, 1,          r * k    }); break;}
  case 14: {return std::vector <int64_t> ({r * c,       r,         1,          r * c * k}); break;}
  case 15: {return std::vector <int64_t> ({r * c * s,   r,         1,          r * c    }); break;}
  case 16: {return std::vector <int64_t> ({r * s,       r * s * k, 1,          r        }); break;}
  case 17: {return std::vector <int64_t> ({r * s * c,   r * s,     1,          r        }); break;}

  case 18: {return std::vector <int64_t> ({s,           s * k,     s * k * c,  1        }); break;}
  case 19: {return std::vector <int64_t> ({s,           s * k * r, s * k,      1        }); break;}
  case 20: {return std::vector <int64_t> ({s * c,       s,         s * c * k,  1        }); break;}
  case 21: {return std::vector <int64_t> ({s * c * r,   s,         s * c,      1        }); break;}
  case 22: {return std::vector <int64_t> ({s * r,       s * r * k, s,          1        }); break;}
  case 23: {return std::vector <int64_t> ({s * r * c,   s * r,     s,          1        }); break;} // main istride
  default:
  {
     std::cout << "WARNING hit default at kind = " << kind << std::endl;
     return std::vector <int64_t> ({1, 1, 1, 1});
  }

}
}

////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////
void valsToSurface (std::vector & valArray, Surface & to)
{
auto numElts = valArray.size ();
for (size_t idx = 0; idx < numElts; ++idx)
{
to.hostPtr [idx] = valArray [idx];
}
cudaMemcpy (to.devPtr, to.hostPtr, size_t (sizeof (to.hostPtr [0]) * numElts), cudaMemcpyHostToDevice);
}

////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////
void img2Surface (Surface & to)
{
std::vector img ({1,-1,2,-2,3,-3,4,-4,5,-5,6,-6,7,-7,8,-8,9,-9,10,-10,11,-11,12,-12,13,-13,14,-14,15,-15,16,-16,17,-17,18,-18,19,-19,20,-20,21,-21,22,-22,23,-23,24,-24,25,-25,26,-26,27,-27,28,-28,29,-29,30,-30,31,-31,32,-32});
valsToSurface (img, to);
}

////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////
void kern2Surface (Surface & to)
{
std::vector kern ({5,-4,4,-3,3,-2,2,-1,1,-9,9,-8,8,-7,7,-6,6,-5});
valsToSurface (kern, to);
};

////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////
void zero2Surface (Surface & to)
{
std::vector from (to.n_elems, 0.0);
valsToSurface (from, to);
}

////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////
unsigned setShowNCompare (const Surface & s, unsigned idim, unsigned kdim, unsigned istride, unsigned kstride)
{
std::vector to;
cudaMemcpy (s.hostPtr, s.devPtr, size_t (sizeof (s.hostPtr [0]) * s.n_elems), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
for (int64_t sidx = 0; sidx < s.n_elems; ++sidx)
{
float val = s.hostPtr [sidx];
to.push_back (val);
}

std::vector control ({-3, 60, -52, 77, -69, 94, -86, 12, -30, 135, -123, 156, -144, 177, -165, 24, -54, 219, -207, 240, -228, 261, -249, 36, -78, 303, -291, 324, -312, 345, -333, 48, -102, 387, -375, 408, -396, 429, -417, 60, -126, 471, -459, 492, -480, 513, -501, 72, -150, 555, -543, 576, -564, 597, -585, 84, -207, 402, -397, 416, -411, 430, -425, 60, -6, -43, 28, -49, 34, -55, 40, 24, -15, -72, 54, -81, 63, -90, 72, 48, -27, -108, 90, -117, 99, -126, 108, 72, -39, -144, 126, -153, 135, -162, 144, 96, -51, -180, 162, -189, 171, -198, 180, 120, -63, -216, 198, -225, 207, -234, 216, 144, -75, -252, 234, -261, 243, -270, 252, 168, -54, -85, 73, -88, 76, -91, 79, 228});
unsigned count = 0;

// set intersection is easier match than identical vector
#if 1
std::multiset lclcontrol (control.begin (), control.end ());
std::multiset toSet (to.begin (), to.end ());
std::vector intersection;
std::set_intersection (lclcontrol.begin (), lclcontrol.end (), toSet.begin (), toSet.end (),
std::back_inserter (intersection));
count = intersection.size ();
#else
for (size_t toIdx = 0; toIdx < to.size (); ++toIdx)
{
if (to [toIdx] == control [toIdx])
{
++count;
}
}

endif
return count;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////
unsigned runFor (unsigned idimForm, unsigned kdimForm, unsigned istrideForm, unsigned kstrideForm)
{
// most of this code is from frontend/samples/cpp/convolutions.cpp
namespace fe = cudnn_frontend;

// n = batch
// c = input channels
// h = height
// w = width
// k = output channels
// r = kernel height
// s = kernel width
int64_t n = 1, c = 1, h = 8, w = 8, k = 2, r = 3, s = 3;

auto build_new_graph = [&](cudnnHandle_t handle) {
auto graph = std::make_sharedfe::graph::Graph();
graph->set_io_data_type(fe::DataType_t::FLOAT).set_compute_data_type(fe::DataType_t::FLOAT);

  std::vector <int64_t> idims = formDim ({n, c, h, w}, idimForm);
  auto X = graph->tensor(fe::graph::Tensor_attributes()
        .set_name("image")

// .set_dim({n, c, h, w})
.set_dim(idims)
// .set_stride({c * h * w, 1, c * w, c}));
.set_stride(formStride (idims, istrideForm)));

  std::vector <int64_t> kdims = formDim ({k, c, r, s}, kdimForm);
  auto W = graph->tensor(fe::graph::Tensor_attributes()
        .set_name("filter")

// .set_dim({k, c, r, s})
.set_dim(kdims)
// .set_stride({c * r * s, 1, c * s, c}));
.set_stride(formStride (kdims, kstrideForm)));

  auto conv_options =
     fe::graph::Conv_fprop_attributes().set_padding({1, 1}).set_stride({1, 1}).set_dilation({1, 1});
  auto Y = graph->conv_fprop(X, W, conv_options);

  Y->set_output(true);

  REQUIRE(graph->validate().is_good());

  REQUIRE(graph->build_operation_graph(handle).is_good());

  REQUIRE(graph->create_execution_plans({fe::HeurMode_t::A}).is_good());

  REQUIRE(graph->check_support(handle).is_good());

  REQUIRE(graph->build_plans(handle).is_good());

  return std::make_tuple(graph, X, W, Y);

};

cudnnHandle_t handle;

checkCudnnErr(cudnnCreate(&handle));

auto [graph, X, W, Y] = build_new_graph(handle);

Surface x_tensor(n * c * h * w, false);
Surface w_tensor(k * c * r * s, false);
Surface y_tensor(n * k * h * w, false); // Should be p, q.
img2Surface (x_tensor);
kern2Surface (w_tensor);
zero2Surface (y_tensor);

std::unordered_map<int64_t, void*> variant_pack = {
{X->get_uid(), x_tensor.devPtr}, {W->get_uid(), w_tensor.devPtr}, {Y->get_uid(), y_tensor.devPtr}};

Surface<int8_t> workspace(graph->get_workspace_size(), false);

std::cout << *graph << std::endl;

cudaDeviceSynchronize();
REQUIRE(graph->execute(handle, variant_pack, workspace.devPtr).is_good());

unsigned match = setShowNCompare (y_tensor, idimForm, kdimForm, istrideForm, kstrideForm);
cudnnDestroy (handle);
return match;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////
unsigned runFor (unsigned idimForm, unsigned kdimForm, unsigned istrideForm, unsigned kstrideForm)
unsigned runFor (unsigned idimForm, unsigned kdimForm, unsigned istrideForm, unsigned kstrideForm)
int main (int argc, char* argv )
{
unsigned idim = atoi (argv [1]),
kdim = atoi (argv [2]),
istride = atoi (argv [3]),
kstride = atoi (argv [4]);
std::cout << runFor (idim, kdim, istride, kstride) << " exit good" << std::endl;
return 0;
};

The control comes from match in our tool’s use of tensorflow. I run using a shell script that loops over idim, kdim, istrides, kstrides. Looks like it is going to be another 40 or so hours, but best match so far is 22. I expect to match 2 * 6 * 6 due to valid region padding.

toNV.tar.gz (825 Bytes)
Tarred cpp and script for convenience

Hello,
We do a convolution not correlation in our networks. My management has asked to reproduce inputs and outputs in tensorflow. Here is script:
import tensorflow.python.keras.layers as Layer
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input
import numpy as np

image = np.array ([[ 1, -1, 2, -2, 3, -3, 4, -4],
[ 5, -5, 6, -6, 7, -7, 8, -8],
[ 9, -9, 10, -10, 11, -11, 12, -12],
[13, -13, 14, -14, 15, -15, 16, -16],
[17, -17, 18, -18, 19, -19, 20, -20],
[21, -21, 22, -22, 23, -23, 24, -24],
[25, -25, 26, -26, 27, -27, 28, -28],
[29, -29, 30, -30, 31, -31, 32, -32]], dtype = np.float32)

weights = [np.array ([[[[-4, 5]],
[[-3, 4]],
[[-2, 3]]],
[[[-1, 2]],
[[-9, 1]],
[[-8, 9]]],
[[[-7, 8]],
[[-6, 7]],
[[-5, 6]]]], dtype = np.float32), np.array ([0,0], dtype = np.float32)]

control_answer = np.array ([-3, 60, -52, 77, -69, 94, -86, 12, -30, 135, -123, 156, -144, 177, -165, 24, -54, 219, -207, 240, -228, 261, -249, 36, -78, 303, -291, 324, -312, 345, -333, 48, -102, 387, -375, 408, -396, 429, -417, 60, -126, 471, -459, 492, -480, 513, -501, 72, -150, 555, -543, 576, -564, 597, -585, 84, -207, 402, -397, 416, -411, 430, -425, 60, -6, -43, 28, -49, 34, -55, 40, 24, -15, -72, 54, -81, 63, -90, 72, 48, -27, -108, 90, -117, 99, -126, 108, 72, -39, -144, 126, -153, 135, -162, 144, 96, -51, -180, 162, -189, 171, -198, 180, 120, -63, -216, 198, -225, 207, -234, 216, 144, -75, -252, 234, -261, 243, -270, 252, 168, -54, -85, 73, -88, 76, -91, 79, 228], dtype = np.float32)

batch = 1
imageWidth = 8
imageHeight = 8
inputChannels = 1
outputChannels = 2
weightsWidth = 3
weightsHeight = 3

image_shape = (imageWidth, imageHeight, inputChannels)

inputs = Input (shape = image_shape, name = “image”)
conv = Layer.Conv2D (filters = outputChannels,
kernel_size = [weightsWidth, weightsHeight],
padding = “SAME”,
kernel_regularizer = None,
trainable = False,
activation = None,
dtype = “float32”,
input_shape = image_shape)

model = Model (inputs, conv (inputs))

model.set_weights (weights)

image = np.expand_dims (image, axis = 0)
image = np.expand_dims (image, axis = 3)

out = model.predict (image)

out = out [0,:,:,(1,0)].ravel ()
print (out)

match = 0
noMatch = 0
for idx in range (len (control_answer)):
outElt = out [idx]
ctrlElt = control_answer [idx]
if outElt == ctrlElt:
match = match + 1
else:
noMatch = noMatch + 1

print (“match %s: , noMatch: %s” % (match, noMatch))

control.tar.gz (1.2 KB)