Hi @Robert_Crovella , thanks for your kind reply.
Since the actual code is a bunch of abstraction and templates which is difficult to share, I’ve made a simple snippet that resembles what I have in my code where the error occurs. I’ve omitted unnecessary “checkCudaError” codes for clarity. Please have a look.
using Scalar = double;
using ScalarPtr = Scalar*;
using Index = size_t;
using IntPtr = int*;
void device_assign(ScalarPtr dResult, Scalar val) noexcept
{
cudaMemcpy(dResult, &val, sizeof(Scalar), cudaMemcpyHostToDevice);
}
struct CudaContext
{
cublasHandle_t cublasHandle{nullptr};
cusparseHandle_t cusparseHandle{nullptr};
};
struct SparseMatrixShape
{
Index nRows{ 0 }; /*!< Number of rows*/
Index nColumns{ 0 }; /*!< Number of columns*/
Index nNonZeros{ 0 }; /*!< Number of non-zero entries*/
};
class SparseMatrixCsr
{
public:
void toDeviceSparseMatrix()
{
// Allocate device variables
IntPtr device_csrRowOffsets{nullptr};
IntPtr device_csrColumnIndex{nullptr};
ScalarPtr device_csrValues{nullptr};
cudaMalloc(reinterpret_cast<VoidPtr *>(&device_csrRowOffsets), sizeof(int) * (shape_.nRows + 1));
cudaMalloc(reinterpret_cast<VoidPtr *>(&device_csrColumnIndex), sizeof(int) * shape_.nNonZeros);
cudaMalloc(reinterpret_cast<VoidPtr *>(&device_csrValues), sizeof(Scalar) * shape_.nNonZeros);
// Copy from host to device
cudaMemcpy(device_csrRowOffsets, csrRowOffsets_.data(), sizeof(int) * (shape_.nRows + 1), cudaMemcpyHostToDevice);
cudaMemcpy(device_csrColumnIndex, csrColumnIndex_.data(), sizeof(int) * shape_.nNonZeros, cudaMemcpyHostToDevice);
cudaMemcpy(device_csrValues, csrValues_.data(), sizeof(Scalar) * shape_.nNonZeros, cudaMemcpyHostToDevice);
cusparseIndexBase_t cudaIndexBase = indexBase_ ? CUSPARSE_INDEX_BASE_ONE : CUSPARSE_INDEX_BASE_ZERO;
// Build sparse matrix by cuSPARSE
cusparseStatus_t cudaStatus = cusparseCreateCsr(
&deviceSparseMatrix_,
shape_.nRows,
shape_.nColumns,
shape_.nNonZeros,
device_csrRowOffsets,
device_csrColumnIndex,
device_csrValues,
CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_32I,
cudaIndexBase,
CUDA_R_64F);
}
auto getDeviceSparseMatrix() const noexcept { return deviceSparseMatrix_; }
// other methods...
private:
SparseMatrixShape shape_;
cusparseSpMatDescr_t deviceSparseMatrix_{ nullptr }; /*!< cuSparse spMatrix descriptor on device*/
int indexBase_{ 1 }; /*!< Sparse matrix index base*/
std::span<int> csrRowOffsets_{}; /*!< Row offsets array, size = nRows_ + 1*/
std::span<int> csrColumnIndex_{}; /*!< Column index array, size = nNonZeros_*/
std::span<Scalar> csrValues_{}; /*!< Value array, size = nNonZeros_*/
// other members...
}
void run_SpMV()
{
// Initialize cusparse context etc.
CudaContext context;
cudaStream_t stream{nullptr};
cudahelper::createCuBLASContext(context.cublasHandle);
cudahelper::createCuSPARSEContext(context.cusparseHandle);
cudaStreamCreate(&stream);
cublasStatus_t cublasStatus = cublasSetStream(context.cublasHandle, stream);
cusparseStatus_t cusparseStatus = cusparseSetStream(context.cusparseHandle, stream);
cublasSetPointerMode(context.cublasHandle, CUBLAS_POINTER_MODE_DEVICE);
cusparseSetPointerMode(context.cusparseHandle, CUSPARSE_POINTER_MODE_DEVICE);
// Allocate constants on device
ScalarPtr device_constants{nullptr};
ScalarPtr device_one{nullptr};
ScalarPtr device_minusOne{nullptr};
ScalarPtr device_zero{nullptr};
cudaMalloc(&device_constants, 3 * sizeof(Scalar));
device_one = device_constants++;
device_zero = device_constants++;
device_minusOne = device_constants++;
device_constants = device_one;
device_assign(device_one, 1.0);
device_assign(device_zero, 0.0);
device_assign(device_minusOne, -1.0);
// Initialize matrixA ...
SparseMatrixCsr matrixA;
/*
... read A from files etc.
*/
matrixA.toDeviceSparseMatrix();
// Create host vector x and vector y
std::vector<double> host_x;
std::vector<double> host_Y;
Index size{matrixA.nColumns};
host_y.assign(size, 0.0);
/*
... assign values to host_x from files etc.
*/
// Create device vector device_x and vector device_y
cusparseDnVecDescr_t device_x{nullptr};
cusparseDnVecDescr_t device_y{nullptr};
// Allocate device variables
ScalarPtr device_dense_x{nullptr};
ScalarPtr device_dense_y{nullptr};
cudaMalloc(reinterpret_cast<VoidPtr *>(&device_dense_x), sizeof(Scalar) * size);
cudaMalloc(reinterpret_cast<VoidPtr *>(&device_dense_y), sizeof(Scalar) * size);
// Copy from host to device
cudaMemcpy(device_dense_x, host_x.data(), sizeof(Scalar) * size, cudaMemcpyHostToDevice);
cudaMemcpy(device_dense_y, host_y.data(), sizeof(Scalar) * size, cudaMemcpyHostToDevice);
// Build dense vector by cuSPARSE
cusparseCreateDnVec(&device_x, size, device_dense_x, CUDA_R_64F);
cusparseCreateDnVec(&device_y, size, device_dense_x, CUDA_R_64F);
// Make buffer for SpMV
VoidPtr device_buffer{nullptr};
size_t bufferSizeMV{0};
cusparseStatus_t cusparseStatus = cusparseSpMV_bufferSize(
cusparseHandle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
device_one,
matrixA.getDeviceSparseMatrix(),
device_x,
device_zero,
device_y,
CUDA_R_64F,
CUSPARSE_SPMV_ALG_DEFAULT,
&bufferSizeMV);
cudaMalloc(&device_buffer, bufferSizeMV);
// Excecute SpMV
auto cusparseStatus = cusparseSpMV(
cusparseHandle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
device_one,
matrixA.getDeviceSparseMatrix(),
device_x,
device_zero,
device_y,
CUDA_R_64F,
CUSPARSE_SPMV_ALG_DEFAULT,
device_buffer);
// Free resources
// ..
}
The error I got from compute-sanitizer is as follows. It doesn’t show “address xxxxxx is misaligned”. I doubt that the part of code that allocates the constants on the device via pointer arithmetic is maybe wrong. What do you think? And the array allocated by cudaMalloc
is guaranteed to be aligned, right? Thanks.
========= Invalid __global__ read of size 4 bytes
========= at void cusparse::csrmv_v3_kernel<std::integral_constant<bool, (bool)0>, int, int, double, double, double, double, void>
(cusparse::KernelCoeffs<T7>, const T3 *, const T2 *, const T3 *, const T4 *, T2, int, int, const T5 *, T6 *, T3 *, T7 *)+0x21b0
========= by thread (0,0,0) in block (64,0,0)
========= Address 0xc083678e8 is out of bounds
========= and is 185 bytes after the nearest allocation at 0xc08367800 of size 48 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:cuEventRecordWithFlags [0x7ffcbc5edc34]
========= in C:\WINDOWS\system32\DriverStore\FileRepository\nvblwi.inf_amd64_77518d2b617eecb2\nvcuda64.dll
========= Host Frame:cusparseSpruneCsr2csr_bufferSizeExt [0x7ffc83cbe003]
========= in C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\cusparse64_12.dll
========= Host Frame:cusparseSpruneCsr2csr_bufferSizeExt [0x7ffc83cbdec6]
========= in C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\cusparse64_12.dll
========= Host Frame:cusparseSpruneCsr2csr_bufferSizeExt [0x7ffc83cbc237]
========= in C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\cusparse64_12.dll
========= Host Frame:cusparseZgemvi_bufferSize [0x7ffc8356accf]
========= in C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\cusparse64_12.dll
========= Host Frame:cusparseZgemvi_bufferSize [0x7ffc835563db]
========= in C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\cusparse64_12.dll
========= Host Frame:cusparseZgemvi_bufferSize [0x7ffc83573729]
========= in C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\cusparse64_12.dll
========= Host Frame:cusparseZgemvi_bufferSize [0x7ffc83577d51]
========= in C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\cusparse64_12.dll
========= Host Frame:cusparseSpMV [0x7ffc834b3d46]
========= in C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\cusparse64_12.dll
========= Host Frame:cudasolver::utility::cuSparseSpmv<cudasolver::SparseMatrixCsr<std::vector<int,std::allocator<int> >,
std::vector<double,std::allocator<double> > >,cusparseDnVecDescr *,cusparseDnVecDescr *,double *> in \some_path\include\utility\cuutility.cuh:347 [0x84226]