How can I use all of the global memory in a CUDA program?

this is my code of my file example.cu
I have a graphic card GeForce GT 640
in your specification it shows 2 GB of global memory,
but when I compute the size needed for my first array:
15000 * 15000 -> array dimension
4 -> array element type is int (4 bytes)
2 -> two copies on the device (input and output)
15000 * 15000 * 4 * 2 = 1,800,000,000 bytes
and I get the error: cudaMemcpyDeviceToHost failed (11) invalid argument
I also computed the size needed for my second array:
8500 * 8500 -> array dimension
4 -> array element type is int (4 bytes)
6 -> six copies on the device (three inputs and three outputs)
8500 * 8500 * 4 * 6 = 1,734,000,000 bytes
and I get the same error: cudaMemcpyDeviceToHost failed (11) invalid argument

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define N2_1D (15000 * 15000)
#define N2_X (15000)
#define N2_Y (15000)
#define N6_1D (8500 * 8500)
#define N6_X (8500)
#define N6_Y (8500)

#define DIM (32 * 16)
#define DIM_X (32)
#define DIM_Y (16)

//device int atomicAdd(int *address, int val);

// Element-wise copy of in[] to out[] with a 1-D launch.
// Uses a grid-stride loop, so it is correct for any grid size, including
// grids with fewer total threads than dim.
__global__ void add2_1D( int *in, int *out, int dim ) {
    unsigned int index = blockDim.x * blockIdx.x + threadIdx.x;
    // NOTE(review): the original multiplied the flat thread index by
    // gridDim.x * blockDim.x, which skips most elements and indexes far
    // out of range; the plain flat index plus a stride loop is the fix.
    while (index < (unsigned int)dim) {
        out[index] = in[index];
        index += blockDim.x * gridDim.x;  // stride = total threads in the grid
    }
}
// Element-wise copy of in[] to out[] with a 2-D grid of 2-D blocks.
// Block and thread coordinates are flattened into one linear index; for a
// pure identity copy any one-to-one flattening is valid, and the bounds
// check discards the threads of the partially-filled tail.
__global__ void add2_2D( int *in, int *out, int dim ) {
    unsigned int thread = blockDim.x * threadIdx.y + threadIdx.x;  // flat id inside the block
    unsigned int block  = gridDim.x * blockIdx.y + blockIdx.x;     // flat block id inside the grid
    unsigned int bDim   = blockDim.x * blockDim.y;                 // threads per block
    unsigned int index  = bDim * block + thread;                   // flat global id
    if (index < (unsigned int)dim) {
        out[index] = in[index];
    }
}
// Element-wise copy of three input arrays to three output arrays (1-D launch).
// Grid-stride loop: correct for any launch configuration, including grids
// smaller than dim.
__global__ void add6_1D( int *in1, int *in2, int *in3, int *out1, int *out2, int *out3, int dim ) {
    unsigned int index = blockDim.x * blockIdx.x + threadIdx.x;
    // NOTE(review): the original multiplied the flat index by
    // gridDim.x * blockDim.x, producing out-of-range/duplicated indices.
    while (index < (unsigned int)dim) {
        out1[index] = in1[index];
        out2[index] = in2[index];
        out3[index] = in3[index];
        index += blockDim.x * gridDim.x;  // stride = total threads in the grid
    }
}
// Element-wise copy of three input arrays to three output arrays with a
// 2-D grid of 2-D blocks. Uses the same flat (block, thread) flattening as
// add2_2D for consistency. The original (indexX, indexY) row-padded mapping
// also formed a one-to-one index for an identity copy, but it silently
// depends on gridDim.x * blockDim.x as a padded row width — the flat
// mapping has no such hidden assumption.
__global__ void add6_2D( int *in1, int *in2, int *in3, int *out1, int *out2, int *out3, int dim ) {
    unsigned int thread = blockDim.x * threadIdx.y + threadIdx.x;  // flat id inside the block
    unsigned int block  = gridDim.x * blockIdx.y + blockIdx.x;     // flat block id inside the grid
    unsigned int index  = blockDim.x * blockDim.y * block + thread; // flat global id
    if (index < (unsigned int)dim) {
        out1[index] = in1[index];
        out2[index] = in2[index];
        out3[index] = in3[index];
    }
}

void checkCUDAErrors(cudaError_t, const char*);

// Wrap any CUDA runtime call: HANDLE_ERROR( cudaFree( p ) ).
// Prints the error string with file/line context and aborts on failure.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
static void HandleError( cudaError_t err, const char *file, int line ) {
    if (err != cudaSuccess) {
        // The paste mangled __FILE__/__LINE__ and turned the quotes of this
        // format string into smart quotes; both are restored here.
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
        exit( EXIT_FAILURE );
    }
}

// Host driver: allocate one input and one output array on host and device,
// copy the input to the GPU, run the 2-D copy kernel, and copy the result
// back. The two device buffers need 2 * 15000 * 15000 * 4 = 1.8 GB; on a
// 2 GB card this allocation can fail, so every CUDA call is checked
// immediately — otherwise the first failure only surfaces later as
// "invalid argument" on the device-to-host memcpy.
int main( int argc, char *argv[] ) {
    cudaError_t cudaStatus;

    int *host_in1, *dev_in1;
    int *host_out1, *dev_out1;

    // allocate the memory on the CPU (each array is ~900 MB — check it)
    host_in1  = (int*)malloc( N2_1D * sizeof(int) );
    host_out1 = (int*)malloc( N2_1D * sizeof(int) );
    if (host_in1 == NULL || host_out1 == NULL) {
        printf( "host malloc failed\n" );
        exit( EXIT_FAILURE );
    }

    // fill the input array on the CPU
    for (int i = 0; i < N2_1D; i++) {
        host_in1[i] = i;
    }

    // allocate the memory on the GPU
    cudaStatus = cudaMalloc( (void**)&dev_in1, N2_1D * sizeof(int) );
    checkCUDAErrors(cudaStatus,"cudaMalloc");
    cudaStatus = cudaMalloc( (void**)&dev_out1, N2_1D * sizeof(int) );
    checkCUDAErrors(cudaStatus,"cudaMalloc");

    // copy the input array to the GPU
    cudaStatus = cudaMemcpy( dev_in1, host_in1, N2_1D * sizeof(int), cudaMemcpyHostToDevice );
    checkCUDAErrors(cudaStatus,"cudaMemcpyHostToDevice");

    // execute the kernel with a 2-D grid of 2-D blocks
    dim3 blockspergrid, threadsperblock;
    threadsperblock.x = DIM_X;
    threadsperblock.y = DIM_Y;
    // ceil-division so the grid covers the whole array even when the
    // dimensions are not multiples of the block size
    blockspergrid.x = (N2_X + (DIM_X - 1)) / DIM_X;
    blockspergrid.y = (N2_Y + (DIM_Y - 1)) / DIM_Y;
    printf( "blockspergrid %d %d threadsperblock %d %d\n", blockspergrid.x, blockspergrid.y, threadsperblock.x, threadsperblock.y);
    add2_2D<<<blockspergrid, threadsperblock>>>( dev_in1, dev_out1, N2_1D );

    // launch-configuration errors never come back from the launch itself;
    // they are only visible through cudaGetLastError()
    cudaStatus = cudaGetLastError();
    checkCUDAErrors(cudaStatus,"kernel launch");
    cudaStatus = cudaDeviceSynchronize();
    checkCUDAErrors(cudaStatus,"cudaDeviceSynchronize");

    // copy the result back from the GPU to the CPU
    cudaStatus = cudaMemcpy( host_out1, dev_out1, N2_1D * sizeof(int), cudaMemcpyDeviceToHost );
    checkCUDAErrors(cudaStatus,"cudaMemcpyDeviceToHost");

    // free the memory we allocated on the GPU
    HANDLE_ERROR( cudaFree( dev_in1 ) );
    HANDLE_ERROR( cudaFree( dev_out1 ) );

    // free the memory we allocated on the CPU
    free( host_in1 );
    free( host_out1 );

    printf("\nPress any key to continue...\n");
    getchar();
    getchar();

    return 0;
}

/*
int main( int argc, char *argv ) {
cudaError_t cudaStatus;

int *host_in1, *dev_in1;
int *host_in2, *dev_in2;
int *host_in3, *dev_in3;
int *host_out1, *dev_out1;
int *host_out2, *dev_out2;
int *host_out3, *dev_out3;

// allocate the memory on the CPU
host_in1   = (int*)malloc( N6_1D * sizeof(int) );
host_out1  = (int*)malloc( N6_1D * sizeof(int) );
host_in2   = (int*)malloc( N6_1D * sizeof(int) );
host_out2  = (int*)malloc( N6_1D * sizeof(int) );
host_in3   = (int*)malloc( N6_1D * sizeof(int) );
host_out3  = (int*)malloc( N6_1D * sizeof(int) );

// fill the arrays 'a' and 'b' on the CPU
for (int i=0; i<N6_1D; i++) {
	host_in1[i] = i;
	host_in2[i] = i;
	host_in3[i] = i;
}

// allocate the memory on the GPU
cudaStatus = cudaMalloc( (void**)&dev_in1, N6_1D * sizeof(int) );
checkCUDAErrors(cudaStatus,"cudaMalloc");
cudaStatus = cudaMalloc( (void**)&dev_out1, N6_1D * sizeof(int) );
checkCUDAErrors(cudaStatus,"cudaMalloc");
cudaStatus = cudaMalloc( (void**)&dev_in2, N6_1D * sizeof(int) );
checkCUDAErrors(cudaStatus,"cudaMalloc");
cudaStatus = cudaMalloc( (void**)&dev_out2, N6_1D * sizeof(int) );
checkCUDAErrors(cudaStatus,"cudaMalloc");
cudaStatus = cudaMalloc( (void**)&dev_in3, N6_1D * sizeof(int) );
checkCUDAErrors(cudaStatus,"cudaMalloc");
cudaStatus = cudaMalloc( (void**)&dev_out3, N6_1D * sizeof(int) );
checkCUDAErrors(cudaStatus,"cudaMalloc");

// copy the arrays to the GPU
cudaStatus = cudaMemcpy( dev_in1, host_in1, N6_1D * sizeof(int), cudaMemcpyHostToDevice );
checkCUDAErrors(cudaStatus,"cudaMemcpyHostToDevice");
cudaStatus = cudaMemcpy( dev_in2, host_in2, N6_1D * sizeof(int), cudaMemcpyHostToDevice );
checkCUDAErrors(cudaStatus,"cudaMemcpyHostToDevice");
cudaStatus = cudaMemcpy( dev_in3, host_in3, N6_1D * sizeof(int), cudaMemcpyHostToDevice );
checkCUDAErrors(cudaStatus,"cudaMemcpyHostToDevice");

//execute the kernel	
//dim3 blockspergrid, threadsperblock;
//threadsperblock.x = DIM;
//blockspergrid.x = (N6_1D + (DIM - 1)) / DIM;
//printf( "blockspergrid %d threadsperblock %d\n", blockspergrid.x, threadsperblock.x);
//add6_1D<<<blockspergrid, threadsperblock>>>( dev_in1, dev_in2, dev_in3, dev_out1, dev_out2, dev_out3, N6_1D );

//execute the kernel	
dim3 blockspergrid, threadsperblock;
threadsperblock.x = DIM_X;				threadsperblock.y = DIM_Y;
blockspergrid.x = (N6_X + (DIM_X - 1)) / DIM_X;		blockspergrid.y = (N6_Y + (DIM_Y - 1)) / DIM_Y;
printf( "blockspergrid %d %d threadsperblock %d %d\n", blockspergrid.x, blockspergrid.y, threadsperblock.x, threadsperblock.y);
add6_2D<<<blockspergrid, threadsperblock>>>( dev_in1, dev_in2, dev_in3, dev_out1, dev_out2, dev_out3, N6_1D );

cudaStatus = cudaThreadSynchronize();
checkCUDAErrors(cudaStatus,"cudaThreadSynchronize");
//cudaStatus = cudaDeviceSynchronize();
//checkCUDAErrors(cudaStatus,"cudaDeviceSynchronize");

// copy the array back from the GPU to the CPU
cudaStatus = cudaMemcpy( host_out1, dev_out1, N6_1D * sizeof(int), cudaMemcpyDeviceToHost );
checkCUDAErrors(cudaStatus,"cudaMemcpyDeviceToHost");
cudaStatus = cudaMemcpy( host_out2, dev_out2, N6_1D * sizeof(int), cudaMemcpyDeviceToHost );
checkCUDAErrors(cudaStatus,"cudaMemcpyDeviceToHost");
cudaStatus = cudaMemcpy( host_out3, dev_out3, N6_1D * sizeof(int), cudaMemcpyDeviceToHost );
checkCUDAErrors(cudaStatus,"cudaMemcpyDeviceToHost");

// free the memory we allocated on the GPU
HANDLE_ERROR( cudaFree( dev_in1 ) );
HANDLE_ERROR( cudaFree( dev_out1 ) );
HANDLE_ERROR( cudaFree( dev_in2 ) );
HANDLE_ERROR( cudaFree( dev_out2 ) );
HANDLE_ERROR( cudaFree( dev_in3 ) );
HANDLE_ERROR( cudaFree( dev_out3 ) );

// free the memory we allocated on the CPU
free( host_in1 );
free( host_out1 );
free( host_in2 );
free( host_out2 );
free( host_in3 );
free( host_out3 );

printf("\nPress any key to continue...\n");
getchar();
getchar();

return 0;

}
*/
// Prints a descriptive failure message (call-site label, numeric code, and
// CUDA's own error string), waits for a keypress, and exits when a CUDA
// runtime call did not return cudaSuccess.
// (The original declaration was split mid-token — "const char" / "str)" —
// and the prompt string contained a Unicode ellipsis; both are repaired.)
void checkCUDAErrors(cudaError_t status, const char *str) {
    if (status != cudaSuccess) {
        printf("\n%s failed (%d): %s", str, status, cudaGetErrorString( status ));
        printf("\nPress any key to continue...");
        getchar();
        getchar();
        exit(-1);
    }
}