This code, which tests the cublasSgemv routine on a Jetson Nano, inexplicably fails when the matrix is larger. If I set M_ROWS = 2000, M_COLUMNS = 1000 and V_ROWS = 1000, I get:
File SgemvTester.cpp, Line 141: CUDA Runtime Error: unspecified launch failure
With the smaller sizes below it works:
/* Problem dimensions for the matrix-vector product. */
constexpr int M_ROWS = 200;    /* number of rows in matrix M */
constexpr int M_COLUMNS = 100; /* number of columns in matrix M */
constexpr int V_ROWS = 100;    /* number of elements in vector V */
/**
 * Fills a column-major (Fortran-order) matrix with a single value.
 *
 * @param M        destination buffer holding at least N_Rows * N_Cols floats
 * @param N_Rows   number of rows (also the stride between consecutive columns)
 * @param N_Cols   number of columns
 * @param initVal  integer value, converted to float, written to every element
 */
void initializeFortranMatrix(float *M, const int N_Rows, const int N_Cols, int initVal)
{
    const float fillValue = static_cast<float>(initVal);

    /* Walking column by column, row by row, visits the column-major buffer
       in increasing linear order, so a running index equals c * N_Rows + r. */
    int linear = 0;
    for (int c = 0; c < N_Cols; ++c)
    {
        for (int r = 0; r < N_Rows; ++r)
        {
            M[linear] = fillValue;
            ++linear;
        }
    }
}
/**
 * Sets every element of a vector to the same value.
 *
 * @param V        destination buffer holding at least N_Rows floats
 * @param N_Rows   number of elements to write
 * @param initVal  integer value, converted to float, written to every element
 */
void initializeVector(float *V, const int N_Rows, int initVal)
{
    const float fillValue = static_cast<float>(initVal);
    int idx = 0;
    while (idx < N_Rows)
    {
        V[idx] = fillValue;
        ++idx;
    }
}
int main(int argc, char **argv)
{
float *M;
float *V;
float *Y;
cublasHandle_t handle;
checkCUBLAS(cublasCreate(&handle));
/* Allocate managed storage */
checkCuda(cudaMallocManaged(&M, sizeof(float) * M_ROWS * M_COLUMNS));
checkCuda(cudaMallocManaged(&V, sizeof(float) * V_ROWS));
checkCuda(cudaMallocManaged(&Y, sizeof(float) * V_ROWS));
initializeFortranMatrix(M, M_ROWS, M_COLUMNS, 11); /* Assuming column-wise storage */
initializeVector(V, V_ROWS, 1); /* Column */
initializeVector(Y, V_ROWS, 0); /* Column */
const float alpha = 1.0f;
const float beta = 0.0f;
checkCUBLAS(cublasSgemv(handle,
CUBLAS_OP_N,
M_ROWS, M_COLUMNS, &alpha, M, M_ROWS, V, 1, &beta, Y, 1));
checkCuda(cudaDeviceSynchronize());
checkCuda(cudaFree(M));
checkCuda(cudaFree(V));
checkCuda(cudaFree(Y));
checkCuda(cudaDeviceReset());
return 0;
}