Yes, the limitation still holds. It is an hardware limitation in the copy engine used in cudamemcpy2D.
These are two functions that I wrote to work around the issue, they are for double precision data but it is very simple to convert them to float:
void copyMatrixH2D (int rows, int cols,
const void *A, int lda, void *B, int ldb)
{
int i,n, nc, nb, bufferSize, lastNc;
int maxBufferSize=524288; // number of double precision elements in 4MB buffer
static int allocateBuffer=1;
static double *buffer;
if(allocateBuffer)
{
#ifdef PAGEABLE
buffer =(double *)malloc(maxBufferSize*8);
#else
cudaMallocHost((void **) &buffer, maxBufferSize*8);
#endif
allocateBuffer=0;
}
/* Find out how many columns of data fit in the buffer */
nc= maxBufferSize/rows;
nc = imin(nc, cols);
bufferSize= nc*rows*8;
/* Find out how many times we need to copy data to the buffer */
nb= cols/nc;
for (n=0; n <nb; n++)
{
/* Copy nc colums from host matrix to the buffer */
for (i=0; i<nc; i++)
{
memcpy (buffer+i*rows, (double *)A+lda*(i+n*nc), 8*rows);
}
/* Copy the buffer to the device matrix */
cudaMemcpy((double *)B+ldb*n*nc, buffer, bufferSize, cudaMemcpyHostToDevice);
}
lastNc = cols - nc*nb;
if (lastNc == 0) return;
/* Copy last colums from host matrix to the buffer */
bufferSize= lastNc*rows*8;
for (i=0; i<lastNc; i++)
memcpy (buffer+i*rows, (double *)A+lda*(i+nb*nc), 8*rows);
/* Copy the buffer to the device matrix */
cudaMemcpy((double *)B+ldb*nb*nc, buffer, bufferSize, cudaMemcpyHostToDevice);
}
void copyMatrixD2H (int rows, int cols,
const void *A, int lda, void *B, int ldb)
{
int i,n, nc, nb, bufferSize, lastNc;
int maxBufferSize=524288; // number of double precision elements in 4MB buffer
static int allocateBuffer=1;
static double *buffer;
if(allocateBuffer)
{
#ifdef PAGEABLE
buffer =(double *)malloc(maxBufferSize*8);
#else
cudaMallocHost((void **) &buffer, maxBufferSize*8);
#endif
allocateBuffer=0;
}
/* Find out how many columns of data fit in the buffer */
nc= maxBufferSize/rows;
nc = imin(nc, cols);
bufferSize= nc*rows*8;
/* Find out how many times we need to copy data to the buffer */
nb= cols/nc;
for (n=0; n <nb; n++)
{
/* Copy the device matrix to the buffer */
cudaMemcpy(buffer,(double *)A+lda*n*nc, bufferSize, cudaMemcpyDeviceToHost);
/* Copy nc colums from the buffer to host matrix */
for (i=0; i<nc; i++)
{
memcpy ((double *)B+ldb*(i+n*nc),buffer+i*rows, 8*rows);
}
}
lastNc = cols - nc*nb;
if (lastNc == 0) return;
/* Copy last colums from device matrix to the buffer */
bufferSize= lastNc*rows*8;
/* Copy the buffer to the device matrix */
cudaMemcpy(buffer,(double *)A+lda*nb*nc, bufferSize, cudaMemcpyDeviceToHost);
for (i=0; i<lastNc; i++)
memcpy ((double *)B+ldb*(i+nb*nc),buffer+i*rows, 8*rows);
}
The usage is similar to S/GetMatrix (the limitations in this example are for double)
if ( (*lda) > 32768 || m_gpu >32768)
{
copyMatrixH2D (m_gpu, k_gpu, A, *lda, devPtrA, m_gpu);
}
else
{
status = cublasSetMatrix (m_gpu, k_gpu, sizeof(A[0]), A, *lda, devPtrA, m_gpu);
if (status != CUBLAS_STATUS_SUCCESS) {
printf ( "!!!! device access error (write A ) %d\n", status);
}
}
Hope this help.