I have been using cublasSgemm quite successfully for multiplying square matrices. Recently, however, I observed that once the number of rows or columns exceeds 269 (i.e. for matrices of 270 x 270 and above), I begin to get "Memory Access Violations" when I debug with the Nsight CUDA Memory Checker enabled. If I do not enable the memory checker, there are no exceptions and the results are also correct.
The following is the exact error message:
Memory Checker detected 64 access violations
access violations on store (global memory)
Is it a limitation of my gpu or the cublasSgemm function? What can I do to resolve this issue?
I am using CUDA 6.5 with MS Visual Studio 2012 on a Quadro FX 1800M (sm_12). The OS is MS Windows 7 64-bit. TDR is set to 60 seconds.
I am including a stripped-down version of the code below:
#include <stdio.h>
#include <cuda.h>
#include <cublas_v2.h>
// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cudaCallOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Reports a failed cuBLAS call on stderr; returns true when `status` is CUBLAS_STATUS_SUCCESS.
static bool cublasCallOk(cublasStatus_t status, const char *what)
{
    if (status == CUBLAS_STATUS_SUCCESS)
        return true;
    fprintf(stderr, "%s failed (cublasStatus_t = %d)\n", what, (int)status);
    return false;
}

/*
 * Multiplies two m x m single-precision matrices on the GPU with cublasSgemm
 * (Z = X * Y, with X filled with 1 and Y filled with 2) and prints Z[0].
 *
 * Every CUDA runtime call is checked against cudaSuccess and every cuBLAS call
 * against CUBLAS_STATUS_SUCCESS. All failure paths fall through to a single
 * cleanup section, so host arrays, device buffers and the cuBLAS handle are
 * released regardless of where the program stops.
 *
 * Returns 0 on success, 1 if any CUDA or cuBLAS call fails.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    const int m = 269; // for 1 - 269 there are no access violations
    // but as soon as m >= 270 Memory Checker throws memory access violations
    // Note: the results are correct even with these violations

    // Compute element and byte counts in size_t so m * m cannot overflow int.
    const size_t count = (size_t)m * (size_t)m;
    const size_t bytes = count * sizeof(float);

    float *X = new float[count];
    float *Y = new float[count];
    float *Z = new float[count];
    float *devX = NULL, *devY = NULL, *devZ = NULL;
    cublasHandle_t handle;
    bool handleCreated = false;
    int exitCode = 1;

    // simple initialization
    for (size_t i = 0; i < count; i++)
    {
        X[i] = 1.0f;
        Y[i] = 2.0f;
    }

    // do { ... } while (0) lets any failed step `break` straight to cleanup.
    do
    {
        if (!cublasCallOk(cublasCreate(&handle), "cublasCreate"))
            break;
        handleCreated = true;

        if (!cudaCallOk(cudaMalloc((void **)&devX, bytes), "cudaMalloc(devX)"))
            break;
        if (!cudaCallOk(cudaMalloc((void **)&devY, bytes), "cudaMalloc(devY)"))
            break;
        if (!cudaCallOk(cudaMalloc((void **)&devZ, bytes), "cudaMalloc(devZ)"))
            break;

        if (!cublasCallOk(cublasSetMatrix(m, m, sizeof(*X), X, m, devX, m), "cublasSetMatrix(X)"))
            break;
        if (!cublasCallOk(cublasSetMatrix(m, m, sizeof(*Y), Y, m, devY, m), "cublasSetMatrix(Y)"))
            break;

        ////////////////////////////////////////////////////////////
        printf("Reached sgemm without error\n");
        const float alpha = 1.0f, beta = 0.0f;
        // cuda memory checker detects access violations when m > 269
        if (!cublasCallOk(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m,
                                      &alpha, devX, m, devY, m, &beta, devZ, m),
                          "cublasSgemm"))
            break;
        // The synchronize call is where asynchronous execution errors are reported.
        if (!cudaCallOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
            break;
        printf("reached after sgemm without error\n");
        ////////////////////////////////////////////////////////////

        if (!cublasCallOk(cublasGetMatrix(m, m, sizeof(*devZ), devZ, m, Z, m), "cublasGetMatrix(Z)"))
            break;

        // just printing a single element for brevity
        printf("...%f...", Z[0]);
        exitCode = 0;
    } while (0);

    // Cleanup: the device pointers start as NULL, so freeing them is safe even
    // when their allocation was never reached.
    cudaFree(devX);
    cudaFree(devY);
    cudaFree(devZ);
    if (handleCreated)
        cublasDestroy(handle);
    delete[] X;
    delete[] Y;
    delete[] Z;

    if (exitCode == 0)
        getchar(); // keep the console window open after a successful run
    return exitCode;
}