Hello everyone,
I just started to implement a FIT TD solver with CUDA and some functionalities of cusparse are not that clear to me, yet.
To start with, the cusparseCreateCsr function takes only a few scalar integers, which live on the host; the large data (indices and values) is allocated on the device. The description of cusparseDestroySpMat only states that it releases host memory, but it releases the device memory as well, right? I am asking because I have split the generation of my final cusparseSpMatDescr_t object into a few methods. Each one gets a reference to the cusparseSpMatDescr_t object and alters its entries in one way or another. Since cuSPARSE doesn’t provide an API for changing or deleting entries, I usually create a new cusparseSpMatDescr_t object, destroy the referenced parameter via cusparseDestroySpMat to prevent any memory leaks, and assign the newly created object to the parameter. I have added a sandbox example below. Is there anything wrong with my way of handling the cusparseSpMatDescr_t objects? In my real implementation I get some inconsistent behaviour, so I would like to know whether I have understood the memory handling properly.
Kind regards,
Jan
#include "CudaMemoryAnalysis.h"
#include <vector>
#include <iostream>
// Entry point: builds the 5x5 test matrix once and reports any error raised
// by the CUDA / cuSPARSE helper checks (they throw std::string).
// Returns 0 on success and 1 when an error was reported.
int main()
{
	const int dimension(5);
	try
	{
		CudaMatrixTest tester(dimension);
		tester.AssembleMatrix();
	}
	catch (const std::string & error)
	{
		// Catch by const reference to avoid copying the message.
		std::cerr << error;
		return 1;
	}
	return 0;
}
---------------------------------- CudaMemoryAnalysis.h -------------------------------------------------
#pragma once
#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc
#include <cusparse.h> // cusparseSpMV
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <vector>
class CudaMatrixTest
{
cusparseHandle_t _cusparseHandle;
int _systemMatrixSize;
cusparseIndexBase_t _idxBase;
cusparseSpMatDescr_t _matrixHandle;
inline void CheckCUDAError(cudaError_t status);
inline void CheckCUSPARSEError(cusparseStatus_t status);
void FillMatCOOFormatVectors(std::vector<int> & rowIndices, std::vector<int> & columnIndices, std::vector<float> & values);
void CreateCUDACSRMatrixHandleFromCOOData( std::vector<int> &rowsIndx, std::vector<int> &columnIndx, std::vector<float> &values, cusparseSpMatDescr_t & matrixHandle);
void DeleteZeroEntries(cusparseSpMatDescr_t & matrix);
public:
CudaMatrixTest(int matrixDimension)
{
_systemMatrixSize = matrixDimension;
CheckCUSPARSEError(cusparseCreate(&_cusparseHandle));
_idxBase = CUSPARSE_INDEX_BASE_ZERO;
}
void AssembleMatrix()
{
std::vector<int> rowIndices, columnIndices;
std::vector<float> values;
FillMatCOOFormatVectors(rowIndices, columnIndices, values);
CreateCUDACSRMatrixHandleFromCOOData(rowIndices, columnIndices, values, _matrixHandle);
ExportCUDAMatrix();
DeleteZeroEntries(_matrixHandle);
ExportCUDAMatrix();
}
void ExportCUDAMatrix();
};
// Throws a std::string describing the failure when a CUDA runtime call did
// not return cudaSuccess; does nothing otherwise.
// No line number is reported: inside this helper the call site is unknown.
inline void CudaMatrixTest::CheckCUDAError(cudaError_t status)
{
	if (status == cudaSuccess)
	{
		return;
	}
	const std::string cudaErrorString = cudaGetErrorString(status);
	throw std::string("CUDA API failed with error: ") + cudaErrorString + "\n";
}
// Throws a std::string describing the failure when a cuSPARSE call did not
// return CUSPARSE_STATUS_SUCCESS; does nothing otherwise.
// __LINE__ is intentionally not used: evaluated here it would always name a
// line of this helper rather than the failing call site.
inline void CudaMatrixTest::CheckCUSPARSEError(cusparseStatus_t status)
{
	if (status == CUSPARSE_STATUS_SUCCESS)
	{
		return;
	}
	const std::string cusparseErrorString = cusparseGetErrorString(status);
	throw std::string("cuSPARSE API failed with error: ") + cusparseErrorString + "\n";
}
---------------------------------- CudaMemoryAnalysis.cpp -------------------------------------------------
#include "CudaMemoryAnalysis.h"
// Appends the COO description (row index, column index, value) of the 5x5
// sample matrix to the three output vectors. Entries are listed row by row.
void CudaMatrixTest::FillMatCOOFormatVectors(std::vector<int> & rowIndices, std::vector<int> & columnIndices, std::vector<float> & values)
{
	/*
	Every '-' marks an entry that is not stored and is therefore implicitly zero.
	The 0 in row 3 is stored explicitly.

	         1 - - 5 -
	Matrix = - - - - -
	         - - 3 - -
	         - 0 - - -
	         - - - - 1
	*/
	const int   sampleRows[]    = { 0, 0, 2, 3, 4 };
	const int   sampleColumns[] = { 0, 3, 2, 1, 4 };
	const float sampleValues[]  = { 1.f, 5.f, 3.f, 0.f, 1.f };
	const int sampleCount = 5;

	rowIndices.reserve(sampleCount);
	columnIndices.reserve(sampleCount);
	values.reserve(sampleCount);

	rowIndices.insert(rowIndices.end(), sampleRows, sampleRows + sampleCount);
	columnIndices.insert(columnIndices.end(), sampleColumns, sampleColumns + sampleCount);
	values.insert(values.end(), sampleValues, sampleValues + sampleCount);
}
void CudaMatrixTest::CreateCUDACSRMatrixHandleFromCOOData(std::vector<int> &rowsIndx, std::vector<int> &columnIndx,std::vector<float> &values, cusparseSpMatDescr_t & matrixHandle)
{
int numberOfNotZeros = values.size();
//Create CUSPARSE matrix handle out of coo format indices of non zero entries
//First allocate device memory and copy content to device
int *d_columns(nullptr), *d_rows(nullptr);
float *d_values(nullptr);
CheckCUDAError(cudaMalloc(reinterpret_cast<void **>(&d_columns), numberOfNotZeros * sizeof(int)));
CheckCUDAError(cudaMalloc(reinterpret_cast<void **>(&d_rows), numberOfNotZeros * sizeof(int)));
CheckCUDAError(cudaMalloc(reinterpret_cast<void **>(&d_values), numberOfNotZeros * sizeof(float)));
CheckCUDAError(cudaMemcpy(d_columns, columnIndx.data(), numberOfNotZeros * sizeof(int), cudaMemcpyHostToDevice));
CheckCUDAError(cudaMemcpy(d_rows, rowsIndx.data(), numberOfNotZeros * sizeof(int), cudaMemcpyHostToDevice));
CheckCUDAError(cudaMemcpy(d_values, values.data(), numberOfNotZeros * sizeof(float), cudaMemcpyHostToDevice));
//Turn coo format in csr by compressing the row vector
int * d_csrRowIndices(nullptr);
int csrVectorSize = _systemMatrixSize + 1;
CheckCUDAError(cudaMalloc(reinterpret_cast<void **>(&d_csrRowIndices), csrVectorSize * sizeof(int)));//Actially less, but how could the size be calculated ? Buffer calculation?
/*
Attention: Even it is not specifically stated in the documentation, the vector that contains the compressed row indices has to be located in the device.
*/
CheckCUSPARSEError(cusparseXcoo2csr(_cusparseHandle, d_rows, numberOfNotZeros, _systemMatrixSize, d_csrRowIndices, _idxBase));
CheckCUSPARSEError(cusparseCreateCsr(&matrixHandle, _systemMatrixSize, _systemMatrixSize, numberOfNotZeros,
d_csrRowIndices, d_columns, d_values,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
}
// Replaces `matrix` by a new CSR matrix that contains only the entries whose
// stored value is different from zero.
//
// The CSR arrays are copied to the host, filtered into COO triplets and
// uploaded again through CreateCUDACSRMatrixHandleFromCOOData. The new matrix
// is built first; only afterwards is the old descriptor destroyed and its
// device arrays freed, so `matrix` stays valid if the rebuild throws.
//
// Assumes the device arrays referenced by `matrix` were allocated with
// cudaMalloc and are owned by this descriptor alone (true for matrices created
// by CreateCUDACSRMatrixHandleFromCOOData). cusparseDestroySpMat releases only
// the descriptor, so the arrays are freed here explicitly.
//
// Throws std::string on unsupported index/value types or on any CUDA /
// cuSPARSE failure.
void CudaMatrixTest::DeleteZeroEntries(cusparseSpMatDescr_t & matrix)
{
	// Query the CSR layout of the existing matrix.
	int64_t numberOfRows = 0, numberOfColumns = 0, numberOfNonZeros = 0;
	cusparseIndexType_t rowOffsetsType, columnIndicesType;
	cusparseIndexBase_t indexBase;
	cudaDataType valueType;
	void *d_rowOffsets = nullptr, *d_columnIndices = nullptr, *d_values = nullptr;
	CheckCUSPARSEError(cusparseCsrGet(matrix, &numberOfRows, &numberOfColumns, &numberOfNonZeros,
		&d_rowOffsets, &d_columnIndices, &d_values,
		&rowOffsetsType, &columnIndicesType, &indexBase, &valueType));

	// The host buffers below are int / float; refuse anything else instead of
	// silently misinterpreting the device data.
	if (rowOffsetsType != CUSPARSE_INDEX_32I || columnIndicesType != CUSPARSE_INDEX_32I || valueType != CUDA_R_32F)
	{
		throw std::string("DeleteZeroEntries supports only 32 bit indices and float values\n");
	}

	std::vector<int> h_rowOffsets(numberOfRows + 1);
	std::vector<int> h_columnIndices(numberOfNonZeros);
	std::vector<float> h_values(numberOfNonZeros);
	CheckCUDAError(cudaMemcpy(h_rowOffsets.data(), d_rowOffsets, (numberOfRows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
	CheckCUDAError(cudaMemcpy(h_columnIndices.data(), d_columnIndices, numberOfNonZeros * sizeof(int), cudaMemcpyDeviceToHost));
	CheckCUDAError(cudaMemcpy(h_values.data(), d_values, numberOfNonZeros * sizeof(float), cudaMemcpyDeviceToHost));

	// Collect the non-zero entries as COO triplets in a single pass.
	std::vector<int> coo_rowIndx, coo_colIndx;
	std::vector<float> notZeroValues;
	coo_rowIndx.reserve(h_values.size());
	coo_colIndx.reserve(h_values.size());
	notZeroValues.reserve(h_values.size());

	// Positions in the value / column arrays are counted from the first row offset.
	const int firstOffset = h_rowOffsets[0];
	for (int64_t row = 0; row < numberOfRows; ++row)
	{
		const int rowBegin = h_rowOffsets[row] - firstOffset;
		const int rowEnd = h_rowOffsets[row + 1] - firstOffset;
		for (int entry = rowBegin; entry < rowEnd; ++entry)
		{
			if (h_values[entry] != 0.0f)
			{
				coo_rowIndx.push_back(static_cast<int>(row));
				coo_colIndx.push_back(h_columnIndices[entry]);
				notZeroValues.push_back(h_values[entry]);
			}
		}
	}

	// Build the replacement before touching the old matrix.
	cusparseSpMatDescr_t compactedMatrix = nullptr;
	CreateCUDACSRMatrixHandleFromCOOData(coo_rowIndx, coo_colIndx, notZeroValues, compactedMatrix);

	cusparseSpMatDescr_t oldMatrix = matrix;
	matrix = compactedMatrix;

	// Release the old descriptor and the device arrays it referenced.
	CheckCUSPARSEError(cusparseDestroySpMat(oldMatrix));
	CheckCUDAError(cudaFree(d_rowOffsets));
	CheckCUDAError(cudaFree(d_columnIndices));
	CheckCUDAError(cudaFree(d_values));
}
void CudaMatrixTest::ExportCUDAMatrix()
{
cudaDeviceSynchronize();
int64_t numberOfRows, numberOfColumns, numberOfNonZeros;
cusparseIndexType_t indexType;
cusparseIndexBase_t indexBase;
cudaDataType valueType;
void * d_rowIndices, *d_colIumnIndices, *d_values;
CheckCUSPARSEError(cusparseCsrGet(_matrixHandle, &numberOfRows, &numberOfColumns, &numberOfNonZeros, &d_rowIndices, &d_colIumnIndices, &d_values, &indexType, &indexType, &indexBase, &valueType));
std::vector<int> h_rowIndices(numberOfRows + 1);
std::vector<int> h_colIumnIndices(numberOfColumns);
std::vector<float> h_values(numberOfNonZeros);
CheckCUDAError(cudaMemcpy(h_rowIndices.data(), d_rowIndices, (numberOfRows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
CheckCUDAError(cudaMemcpy(h_colIumnIndices.data(), d_colIumnIndices, numberOfNonZeros * sizeof(int), cudaMemcpyDeviceToHost));
CheckCUDAError(cudaMemcpy(h_values.data(), d_values, numberOfNonZeros * sizeof(float), cudaMemcpyDeviceToHost));
}