What is the maximum buffer size for cudaMemcpy(), and can it be modified?

I am having a problem transferring some data to GPU memory. The size of the data is 256*256*8 = 524288 elements, declared as float. It can transfer 131072 values but no more. Can you please tell me how I can transfer the data in full? I know I could divide it into chunks, but I need to transfer it in one go. My code is here.

// NOTE(review): the data is 256*256*8 floats, but cudaMalloc/cudaMemcpy below
// are given 256*256*8 BYTES — the "* sizeof(float)" factor used for malloc()
// is missing, so only 524288/4 = 131072 floats ever reach the device.
float *fold = (float *) malloc (256*256*8 * sizeof(float));
    float *r_fold = (float *) malloc (256*256*8 * sizeof(float));
    // Fill the host buffer with 0, 1, 2, ... as test data.
    for (int i = 0; i < (256*256*8) ; i++)
    {
       *(fold + i)  = i;
    } 
    float *d_fold;
    // BUG: byte count is missing "* sizeof(float)" here...
    cudaMalloc((void**)&d_fold, 256*256*8);
    // ...and here — only a quarter of the array is copied.
    cudaMemcpy(d_fold, fold, 256*256*8, cudaMemcpyHostToDevice);
    dim3 grid(1, 100);
    dim3 block(100, 2);
    // NOTE(review): `tol` and the kernel `mem_read` are not defined in this
    // fragment — presumably declared elsewhere in the asker's full code.
    mem_read << <grid, block>> > (d_fold, tol);
    // BUG: byte count missing "* sizeof(float)" here as well.
    cudaMemcpy(r_fold, d_fold, 256*256*8, cudaMemcpyDeviceToHost);

There is no limit on the size of a single cudaMemcpy (beyond available memory).
However, you need to consistently multiply the number of elements by the element size to get the number of bytes to transfer.
Your code multiplies by sizeof(float) in some places, but not in others.

I couldn't execute the code and was unable to figure out the issue. When I use CUDA Debugging Mode and inspect the memory at d_fold in the kernel, it only shows 131072 values and 0 after that, which means I can only launch 8192 threads. Here is my full code, if you could figure out the problem.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <tchar.h>
#include <fstream>
#include <iostream>
#include <limits>
#include <stdlib.h>
#include <math.h>
#include <time.h>
using namespace std;
// Problem dimensions used to size the fold arrays (nx*ny*nc elements total)
// and to address each thread's af*nc-element chunk inside the kernel.
const int af = 2;   // inner chunk factor — presumably an acceleration factor; TODO confirm
const int nc = 8;   // per-pixel depth — presumably number of channels/coils; TODO confirm
const int nx = 256; // first spatial dimension
const int ny = 256; // second spatial dimension

// Adds 2.0 to every element of this thread's chunk of d_fold.
// Each thread owns a contiguous run of af*nc (= 16) doubles starting at
// idx*af*nc, where idx is the thread's flattened global id.
//
// Preconditions (not checked in-kernel):
//  - gridDim.x must be 1: `idx = c + r*blockDim.x` is only a unique flat id
//    when there is a single block in x (as in the launch in main()); with more
//    x-blocks, different threads would collide on the same idx.
//  - The launch must satisfy (total threads)*af*nc <= allocated element count;
//    there is no bounds check against the allocation size.
//
// NOTE(review): the `tol` parameter is unused in this kernel body.
__global__ void mem_read(double *d_fold, double tol)
{
   int c = blockIdx.x * blockDim.x + threadIdx.x;   // global x index
   int r = blockIdx.y * blockDim.y + threadIdx.y;   // global y index
   int idx = c + (r*blockDim.x);                    // flat thread id (assumes gridDim.x == 1)
   // Walk this thread's af*nc-element chunk and increment each value by 2.
   for (int i = 0; i < nc; i++)
   {
	for (int j = 0; j < af; j++)
	{
	   *(d_fold + idx*af*nc + i*af + j) = *(d_fold + idx*af*nc + i*af + j) + 2;
        }				
   }

}

// Reads nx*ny*nc doubles from imfold.txt, runs mem_read over the data on the
// GPU (each thread adds 2 to its chunk), and writes the result to r_fold.txt.
// Returns 0 on success, 1 on any allocation/CUDA failure.
int main()
{
   const size_t n_elems = (size_t)nx * ny * nc;
   // FIX: the byte count must include sizeof(double). The original code passed
   // the raw element count (nx*ny*nc) to cudaMalloc/cudaMemcpy, so only 1/8 of
   // the doubles were ever allocated/transferred — the bug discussed above.
   const size_t n_bytes = n_elems * sizeof(double);

   double *fold   = (double *) malloc (n_bytes);
   double *r_fold = (double *) malloc (n_bytes);
   if (fold == NULL || r_fold == NULL)
   {
      cerr << "host allocation failed" << endl;
      return 1;
   }

   // Load the input data, one value per element.
   ifstream fold_read ("imfold.txt");
   double file_in = 0.0;
   for (size_t i = 0; i < n_elems; i++)
   {
      fold_read >> file_in;
      fold[i] = file_in;
   }
   fold_read.close();

   double *d_fold = NULL;
   cudaError_t err = cudaMalloc((void**)&d_fold, n_bytes);
   if (err != cudaSuccess)
   {
      cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << endl;
      return 1;
   }

   err = cudaMemcpy(d_fold, fold, n_bytes, cudaMemcpyHostToDevice);
   if (err != cudaSuccess)
   {
      cerr << "cudaMemcpy H2D failed: " << cudaGetErrorString(err) << endl;
      return 1;
   }

   // tol is forwarded to the kernel (currently unused there).
   const double tol = 0.0;

   // 1x32 blocks of 32x2 threads = 2048 threads; each touches af*nc elements.
   // (FIX: removed a stray, non-compiling leftover line that was here.)
   dim3 grid(1, 32);
   dim3 block(32, 2);
   mem_read << <grid, block>> > (d_fold, tol);

   // Kernel launches do not return errors directly: check the launch config
   // here, then catch asynchronous execution errors at the sync point.
   err = cudaGetLastError();
   if (err != cudaSuccess)
   {
      cerr << "kernel launch failed: " << cudaGetErrorString(err) << endl;
      return 1;
   }
   err = cudaDeviceSynchronize();
   if (err != cudaSuccess)
   {
      cerr << "kernel execution failed: " << cudaGetErrorString(err) << endl;
      return 1;
   }

   err = cudaMemcpy(r_fold, d_fold, n_bytes, cudaMemcpyDeviceToHost);
   if (err != cudaSuccess)
   {
      cerr << "cudaMemcpy D2H failed: " << cudaGetErrorString(err) << endl;
      return 1;
   }

   // Write the result, one value per line.
   ofstream rp_mem_real_write;
   rp_mem_real_write.open ("r_fold.txt");
   for (size_t i = 0; i < n_elems; i++)
   {
      rp_mem_real_write << r_fold[i] << endl;
   }
   rp_mem_real_write.close();

   free (fold);
   free (r_fold);
   cudaFree (d_fold);
   return 0;
}

Thanks a lot for the response. I got your point, and the problem is resolved :)

Regards,