Data transfer failed

Hi, I am using this code to test transfer speed and optimizations, but I don't understand why the transfer fails when the data is larger than 1340 MB while CUDA doesn't report any error. I verify the transfer by comparing the sent data with the received data.

// Round-trip transfer test: fills a host buffer, copies it to the device and
// back into a second host buffer, then compares the two host buffers
// element by element.
//
// Returns 0 when every element matches, EXIT_FAILURE when a host allocation
// fails or when the received data differs from the sent data.
//
// NOTE(review): checkCuda is defined outside this block; it is presumably the
// error-reporting wrapper for CUDA runtime calls -- confirm that it actually
// aborts/reports in release builds, otherwise CUDA errors can go unnoticed.
int main()
{
  // Sizes are kept in size_t: with a 32-bit unsigned byte count the product
  // nElements * sizeof(float) wraps around once nElements reaches 2^30.
  const size_t nElements = (size_t)400 * 1024 * 1024;
  const size_t bytes = nElements * sizeof(float);

  // Host buffers: data to send and data read back.
  float *Host_enviado = (float*)malloc(bytes);
  float *Host_recibido = (float*)malloc(bytes);

  // malloc returns NULL when a request this large cannot be satisfied;
  // without this check the initialization below would write through NULL.
  if (Host_enviado == NULL || Host_recibido == NULL) {
    fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
    free(Host_enviado);    // free(NULL) is a no-op
    free(Host_recibido);
    return EXIT_FAILURE;
  }

  // Device buffer.
  float *device_dato = NULL;
  checkCuda( cudaMalloc((void**)&device_dato, bytes) );

  // Initialize: the send buffer holds its own index, the receive buffer is
  // zeroed. Indices above 2^24 are rounded when converted to float, but the
  // check below only compares the two buffers with each other, so the
  // rounding does not affect the result.
  for (size_t i = 0; i < nElements; ++i) Host_enviado[i] = (float)i;
  memset(Host_recibido, 0, bytes);

  // Output device info and transfer size.
  cudaDeviceProp prop;
  checkCuda( cudaGetDeviceProperties(&prop, 0) );

  printf("\nDevice: %s\n", prop.name);
  printf("Transfer size (MB): %zu\n", bytes / (1024 * 1024));

  // Host -> device, then device -> host into the second buffer.
  checkCuda( cudaMemcpy(device_dato, Host_enviado, bytes, cudaMemcpyHostToDevice) );
  checkCuda( cudaMemcpy(Host_recibido, device_dato, bytes, cudaMemcpyDeviceToHost) );

  // Verify the round trip; stop at the first mismatch.
  int status = 0;
  for (size_t i = 0; i < nElements; ++i) {
    if (Host_enviado[i] != Host_recibido[i]) {
      printf("*** transfers failed ***");
      fprintf(stderr, "first mismatch at element %zu: sent %f, received %f\n",
              i, Host_enviado[i], Host_recibido[i]);
      status = EXIT_FAILURE;
      break;
    }
  }

  printf("\n");

  // Release the allocated memory.
  checkCuda( cudaFree(device_dato) );
  free(Host_enviado);
  free(Host_recibido);

  return status;
}

Can anyone explain this problem and help me? Thanks very much.