I'm working with nvprof/NVIDIA Visual Profiler on Windows to profile a matrix-vector multiplication (GEMV). When I generate the .exe file (CUDA application binary) with nvcc (command: nvcc kernel.cu -o naive_gemv.exe -ccbin "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.34.31933\bin" -lcudart -lcuda), the generated .exe apparently does not execute the kernels: when I run it, I get only zeros as the result of the multiplication. I used Visual Studio 2022 to write and run the program, and there I get the results without errors, but not when I generate the .exe file myself. In addition, when I profile that .exe with NVIDIA Visual Profiler, I don't see any results for the kernels.
How can I solve this problem? Am I missing something when generating the .exe (binary) file?
This is my code:
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <math.h>
#include <string>
#include <vector>
#include "cublas_v2.h"
using namespace std;
// Naive GEMV kernel: computes result = matrix * vector for a square,
// row-major N x N matrix stored as a flat array of N * N doubles.
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global
// index and advances by the total number of launched threads, so every row
// is computed even when numBlocks * numThreads is smaller than N.
//
// matrix : device pointer, N * N elements (read only)
// vector : device pointer, N elements (read only)
// result : device pointer, N elements; each element is OVERWRITTEN with the
//          dot product, so the buffer does not need to be zeroed beforehand
// N      : number of rows (== number of columns); N <= 0 is a no-op
__global__ void multiplyMV(double* matrix, double* vector, double* result, int N)
{
    if (N <= 0)
    {
        return;
    }
    const size_t rows = (size_t)N;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t row = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         row < rows;
         row += stride)
    {
        // 64-bit offset: row * N overflows a 32-bit int once N exceeds 46340
        const size_t rowStart = row * rows;
        // accumulate in a register instead of read-modify-writing global
        // memory whose initial contents are not guaranteed to be zero
        double sum = 0.0;
        for (size_t i = 0; i < rows; i++)
        {
            sum += matrix[rowStart + i] * vector[i];
        }
        result[row] = sum;
    }
}
// Returns a pseudo-random double in the closed range [0.01, 10].
// The value comes from rand(), so the sequence is controlled by srand().
double generateRandDouble()
{
    const double low = 0.01;
    const double high = 10.0;
    // fraction is in [0, 1]
    const double fraction = (double)rand() / (double)RAND_MAX;
    // convex combination: yields exactly `low` at 0 and exactly `high` at 1,
    // so the result never exceeds the documented upper bound
    return low * (1.0 - fraction) + high * fraction;
}
// Writes a fresh generateRandDouble() value into each of the first N
// elements of arr, in ascending index order.
// arr must have room for at least N doubles; N <= 0 writes nothing.
void fillRandom(double* arr, int N)
{
    double* cursor = arr;
    for (int left = N; left > 0; --left)
    {
        *cursor = generateRandDouble();
        ++cursor;
    }
}
// Sets the first N elements of arr to zero.
// arr must have room for at least N doubles; N <= 0 writes nothing.
void fillDefault(double* arr, int N)
{
    int idx = 0;
    while (idx < N)
    {
        arr[idx] = 0;
        ++idx;
    }
}
// Element-wise difference: residual[i] = blasResult[i] - result[i]
// for i in [0, N).
// result, blasResult and residual must each hold at least N doubles;
// N <= 0 writes nothing.
void calcResidual(double* result, double* blasResult, double* residual, int N)
{
    int idx = 0;
    while (idx < N)
    {
        const double reference = blasResult[idx];
        const double computed = result[idx];
        residual[idx] = reference - computed;
        ++idx;
    }
}
// Returns true when every one of the first N residuals has a magnitude of
// at most 0.0001, false otherwise. An empty range (N <= 0) returns true.
// The magnitude is compared, so a large negative residual counts as large;
// a NaN residual is also reported as not small.
bool isResidualSmall(double* residual, int N)
{
    const double tolerance = 0.0001;
    for (int i = 0; i < N; i++)
    {
        // written as !(x <= tol) so that NaN fails the check
        if (!(fabs(residual[i]) <= tolerance))
        {
            return false;
        }
    }
    return true;
}
// Prints one execution configuration to standard output: a heading line,
// a line with N, the number of blocks and the threads per block, and a
// trailing blank line.
void printConfig(int N, int numBlocks, int numThreads)
{
    std::cout << "Execution Configuration:" << std::endl;
    std::cout << "N = " << N << ", ";
    std::cout << "Number of Blocks = " << numBlocks << ", ";
    std::cout << "Number of Threads Per Block = " << numThreads << std::endl
              << std::endl;
}
// Prints a square matrix to standard output.
// matrix is a flat array of N * N doubles that logically represents an
// N x N matrix stored row by row; N is the number of rows (same as the
// number of columns).
// At most the top-left 10 x 10 corner is printed. When the matrix is larger
// than that, " ..." is appended to every printed row and one extra " ..."
// line follows the last row to mark the truncation.
void printMatrix(double* matrix, int N)
{
    const int maxShown = 10;
    // The row stride is always the real N. Only the number of rows and
    // columns shown is capped; capping the stride as well would read the
    // wrong elements of a large matrix.
    const bool tooLarge = N > maxShown;
    const int shown = tooLarge ? maxShown : N;
    // print matrix name
    std::cout << "Matrix" << std::endl;
    // print matrix elements
    for (int j = 0; j < shown; j++)
    {
        const double* rowStart = matrix + (size_t)j * (size_t)N;
        for (int k = 0; k < shown; k++)
        {
            std::cout << rowStart[k] << " ";
        }
        // add ellipsis to represent truncated columns
        if (tooLarge)
        {
            std::cout << " ...";
        }
        std::cout << std::endl;
    }
    // add ellipsis to represent truncated rows
    if (tooLarge)
    {
        std::cout << " ..." << std::endl;
    }
    std::cout << std::endl;
}
// Prints a labelled vector to standard output: the name followed by " :",
// then the elements separated by spaces on one line, then a blank line.
// vec must hold at least N doubles.
// Only the first 100 elements are printed; when N is larger, " ... " is
// appended after them to mark the truncation.
void printVec(double* vec, int N, string name)
{
    const int maxShown = 100;
    const bool truncated = (N > maxShown);
    const int shown = truncated ? maxShown : N;

    std::cout << name << " :" << std::endl;
    int idx = 0;
    while (idx < shown)
    {
        std::cout << vec[idx] << " ";
        ++idx;
    }
    if (truncated)
    {
        std::cout << " ... "; // ellipsis marks the cut-off elements
    }
    std::cout << std::endl << std::endl;
}
// Appends the five built-in execution configurations to configs.
// Each appended entry is a 3-element vector:
//   [0] number of elements N,
//   [1] number of blocks,
//   [2] number of threads per block.
// Existing entries of configs are left in place.
void setUpConfigs(vector<vector<int>>& configs)
{
    static const int kTable[][3] = {
        { 1024, 5, 205 },   // 1024 elements, 5 blocks, 205 threads per block
        { 4095, 12, 342 },  // 4095 elements, 12 blocks, 342 threads per block
        { 12, 12, 1 },      // 12 elements, 12 blocks, 1 thread per block
        { 8190, 13, 630 },  // 8190 / 13 = 630
        { 11585, 200, 58 }, // 11585 / 200 = 58 (rounded up)
    };
    const int count = (int)(sizeof(kTable) / sizeof(kTable[0]));
    for (int row = 0; row < count; ++row)
    {
        configs.push_back(std::vector<int>(kTable[row], kTable[row] + 3));
    }
}
// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess. `what` names the step that failed.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Runs the naive GEMV kernel once per built-in execution configuration and
// prints the configuration, the input matrix and vector, and the kernel
// result. Every CUDA call is checked, so a kernel that fails to launch or
// run is reported on stderr instead of silently producing an all-zero result.
// Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA
// error.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;
    // Set up execution configurations
    std::vector<std::vector<int>> configs;
    setUpConfigs(configs);
    for (size_t i = 0; i < configs.size(); i++)
    {
        // get execution configuration
        const int N = configs[i][0];
        const int numBlocks = configs[i][1];
        const int numThreads = configs[i][2];
        // byte sizes are kept in size_t: N * N * sizeof(double) does not fit
        // in an int once N grows past roughly 16000
        const size_t vectorSize = (size_t)N * sizeof(double);
        const size_t matrixSize = (size_t)N * (size_t)N * sizeof(double);
        // allocate memory on host
        double* h_matrix = (double*)malloc(matrixSize);
        double* h_vector = (double*)malloc(vectorSize);
        double* h_result = (double*)malloc(vectorSize);
        if (h_matrix == NULL || h_vector == NULL || h_result == NULL)
        {
            fprintf(stderr, "Host allocation failed for N = %d\n", N);
            free(h_matrix);
            free(h_vector);
            free(h_result);
            return EXIT_FAILURE;
        }
        // same fill order as before: matrix first, then vector
        fillRandom(h_matrix, N * N);
        fillRandom(h_vector, N);
        // allocate memory on device
        double* d_matrix = NULL;
        double* d_vector = NULL;
        double* d_result = NULL;
        checkCuda(cudaMalloc((void**)&d_matrix, matrixSize), "cudaMalloc(d_matrix)");
        checkCuda(cudaMalloc((void**)&d_vector, vectorSize), "cudaMalloc(d_vector)");
        checkCuda(cudaMalloc((void**)&d_result, vectorSize), "cudaMalloc(d_result)");
        // Copy inputs to device
        checkCuda(cudaMemcpy(d_matrix, h_matrix, matrixSize, cudaMemcpyHostToDevice),
                  "copy of matrix to device");
        checkCuda(cudaMemcpy(d_vector, h_vector, vectorSize, cudaMemcpyHostToDevice),
                  "copy of vector to device");
        // cudaMalloc does not clear memory; start the output from zero so
        // the result is well defined for every row
        checkCuda(cudaMemset(d_result, 0, vectorSize), "cudaMemset(d_result)");
        // launch kernel function
        multiplyMV<<<numBlocks, numThreads>>>(d_matrix, d_vector, d_result, N);
        // a launch does not return a status itself: launch failures are
        // reported by cudaGetLastError(), failures during execution by the
        // next synchronizing call
        checkCuda(cudaGetLastError(), "multiplyMV launch");
        checkCuda(cudaDeviceSynchronize(), "multiplyMV execution");
        // Copy result back to host
        checkCuda(cudaMemcpy(h_result, d_result, vectorSize, cudaMemcpyDeviceToHost),
                  "copy of result to host");
        // print all data
        printConfig(N, numBlocks, numThreads);
        printMatrix(h_matrix, N);
        printVec(h_vector, N, "Vector");
        printVec(h_result, N, "Kernel Result");
        // free memory on device
        checkCuda(cudaFree(d_matrix), "cudaFree(d_matrix)");
        checkCuda(cudaFree(d_vector), "cudaFree(d_vector)");
        checkCuda(cudaFree(d_result), "cudaFree(d_result)");
        // free memory on host
        free(h_matrix);
        free(h_vector);
        free(h_result);
    }
    return 0;
}
No problems when the .cu program is built and run from Visual Studio 2022:
No kernel is visible in NVIDIA Visual Profiler:
This is how I created the .exe binary file: