How can I use all of the global memory in a CUDA program?

this is my code of my file example.cu
I have a graphic card GeForce GT 640
in your specification it shows 2 GB of global memory,
but when I compute the size needed for my first array:
15000 * 15000 -> array dimension
4 -> array element type is int (4 bytes)
2 -> two copies on the device (input and output)
15000 * 15000 * 4 * 2 = 1,800,000,000 bytes
and I get the error: cudaMemcpyDeviceToHost failed (11) invalid argument
I also computed the size needed for my second array:
8500 * 8500 -> array dimension
4 -> array element type is int (4 bytes)
6 -> six copies on the device (three inputs and three outputs)
8500 * 8500 * 4 * 6 = 1,734,000,000 bytes
and I get the same error: cudaMemcpyDeviceToHost failed (11) invalid argument

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define N2_1D (15000 * 15000)
#define N2_X (15000)
#define N2_Y (15000)
#define N6_1D (8500 * 8500)
#define N6_X (8500)
#define N6_Y (8500)

#define DIM (32 * 16)
#define DIM_X (32)
#define DIM_Y (16)

//device int atomicAdd(int *address, int val);

// Element-wise copy of in[] to out[] with a 1-D launch.
// Uses a grid-stride loop, so it is correct for any grid size, including
// grids with fewer total threads than dim.
__global__ void add2_1D( int *in, int *out, int dim ) {
    unsigned int index = blockDim.x * blockIdx.x + threadIdx.x;
    // NOTE(review): the original multiplied the flat thread index by
    // gridDim.x * blockDim.x, which skips most elements and indexes far
    // out of range; the plain flat index plus a stride loop is the fix.
    while (index < (unsigned int)dim) {
        out[index] = in[index];
        index += blockDim.x * gridDim.x;  // stride = total threads in the grid
    }
}
// Element-wise copy of in[] to out[] with a 2-D grid of 2-D blocks.
// Block and thread coordinates are flattened into one linear index; for a
// pure identity copy any one-to-one flattening is valid, and the bounds
// check discards the threads of the partially-filled tail.
__global__ void add2_2D( int *in, int *out, int dim ) {
    unsigned int thread = blockDim.x * threadIdx.y + threadIdx.x;  // flat id inside the block
    unsigned int block  = gridDim.x * blockIdx.y + blockIdx.x;     // flat block id inside the grid
    unsigned int bDim   = blockDim.x * blockDim.y;                 // threads per block
    unsigned int index  = bDim * block + thread;                   // flat global id
    if (index < (unsigned int)dim) {
        out[index] = in[index];
    }
}
// Element-wise copy of three input arrays to three output arrays (1-D launch).
// Grid-stride loop: correct for any launch configuration, including grids
// smaller than dim.
__global__ void add6_1D( int *in1, int *in2, int *in3, int *out1, int *out2, int *out3, int dim ) {
    unsigned int index = blockDim.x * blockIdx.x + threadIdx.x;
    // NOTE(review): the original multiplied the flat index by
    // gridDim.x * blockDim.x, producing out-of-range/duplicated indices.
    while (index < (unsigned int)dim) {
        out1[index] = in1[index];
        out2[index] = in2[index];
        out3[index] = in3[index];
        index += blockDim.x * gridDim.x;  // stride = total threads in the grid
    }
}
// Element-wise copy of three input arrays to three output arrays with a
// 2-D grid of 2-D blocks. Uses the same flat (block, thread) flattening as
// add2_2D for consistency. The original (indexX, indexY) row-padded mapping
// also formed a one-to-one index for an identity copy, but it silently
// depends on gridDim.x * blockDim.x as a padded row width — the flat
// mapping has no such hidden assumption.
__global__ void add6_2D( int *in1, int *in2, int *in3, int *out1, int *out2, int *out3, int dim ) {
    unsigned int thread = blockDim.x * threadIdx.y + threadIdx.x;  // flat id inside the block
    unsigned int block  = gridDim.x * blockIdx.y + blockIdx.x;     // flat block id inside the grid
    unsigned int index  = blockDim.x * blockDim.y * block + thread; // flat global id
    if (index < (unsigned int)dim) {
        out1[index] = in1[index];
        out2[index] = in2[index];
        out3[index] = in3[index];
    }
}

void checkCUDAErrors(cudaError_t, const char*);

// Wrap any CUDA runtime call: HANDLE_ERROR( cudaFree( p ) ).
// Prints the error string with file/line context and aborts on failure.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
static void HandleError( cudaError_t err, const char *file, int line ) {
    if (err != cudaSuccess) {
        // The paste mangled __FILE__/__LINE__ and turned the quotes of this
        // format string into smart quotes; both are restored here.
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
        exit( EXIT_FAILURE );
    }
}

// Host driver: allocate one input and one output array on host and device,
// copy the input to the GPU, run the 2-D copy kernel, and copy the result
// back. The two device buffers need 2 * 15000 * 15000 * 4 = 1.8 GB; on a
// 2 GB card this allocation can fail, so every CUDA call is checked
// immediately — otherwise the first failure only surfaces later as
// "invalid argument" on the device-to-host memcpy.
int main( int argc, char *argv[] ) {
    cudaError_t cudaStatus;

    int *host_in1, *dev_in1;
    int *host_out1, *dev_out1;

    // allocate the memory on the CPU (each array is ~900 MB — check it)
    host_in1  = (int*)malloc( N2_1D * sizeof(int) );
    host_out1 = (int*)malloc( N2_1D * sizeof(int) );
    if (host_in1 == NULL || host_out1 == NULL) {
        printf( "host malloc failed\n" );
        exit( EXIT_FAILURE );
    }

    // fill the input array on the CPU
    for (int i = 0; i < N2_1D; i++) {
        host_in1[i] = i;
    }

    // allocate the memory on the GPU
    cudaStatus = cudaMalloc( (void**)&dev_in1, N2_1D * sizeof(int) );
    checkCUDAErrors(cudaStatus,"cudaMalloc");
    cudaStatus = cudaMalloc( (void**)&dev_out1, N2_1D * sizeof(int) );
    checkCUDAErrors(cudaStatus,"cudaMalloc");

    // copy the input array to the GPU
    cudaStatus = cudaMemcpy( dev_in1, host_in1, N2_1D * sizeof(int), cudaMemcpyHostToDevice );
    checkCUDAErrors(cudaStatus,"cudaMemcpyHostToDevice");

    // execute the kernel with a 2-D grid of 2-D blocks
    dim3 blockspergrid, threadsperblock;
    threadsperblock.x = DIM_X;
    threadsperblock.y = DIM_Y;
    // ceil-division so the grid covers the whole array even when the
    // dimensions are not multiples of the block size
    blockspergrid.x = (N2_X + (DIM_X - 1)) / DIM_X;
    blockspergrid.y = (N2_Y + (DIM_Y - 1)) / DIM_Y;
    printf( "blockspergrid %d %d threadsperblock %d %d\n", blockspergrid.x, blockspergrid.y, threadsperblock.x, threadsperblock.y);
    add2_2D<<<blockspergrid, threadsperblock>>>( dev_in1, dev_out1, N2_1D );

    // launch-configuration errors never come back from the launch itself;
    // they are only visible through cudaGetLastError()
    cudaStatus = cudaGetLastError();
    checkCUDAErrors(cudaStatus,"kernel launch");
    cudaStatus = cudaDeviceSynchronize();
    checkCUDAErrors(cudaStatus,"cudaDeviceSynchronize");

    // copy the result back from the GPU to the CPU
    cudaStatus = cudaMemcpy( host_out1, dev_out1, N2_1D * sizeof(int), cudaMemcpyDeviceToHost );
    checkCUDAErrors(cudaStatus,"cudaMemcpyDeviceToHost");

    // free the memory we allocated on the GPU
    HANDLE_ERROR( cudaFree( dev_in1 ) );
    HANDLE_ERROR( cudaFree( dev_out1 ) );

    // free the memory we allocated on the CPU
    free( host_in1 );
    free( host_out1 );

    printf("\nPress any key to continue...\n");
    getchar();
    getchar();

    return 0;
}

/*
int main( int argc, char *argv ) {
cudaError_t cudaStatus;

int *host_in1, *dev_in1;
int *host_in2, *dev_in2;
int *host_in3, *dev_in3;
int *host_out1, *dev_out1;
int *host_out2, *dev_out2;
int *host_out3, *dev_out3;

// allocate the memory on the CPU
host_in1   = (int*)malloc( N6_1D * sizeof(int) );
host_out1  = (int*)malloc( N6_1D * sizeof(int) );
host_in2   = (int*)malloc( N6_1D * sizeof(int) );
host_out2  = (int*)malloc( N6_1D * sizeof(int) );
host_in3   = (int*)malloc( N6_1D * sizeof(int) );
host_out3  = (int*)malloc( N6_1D * sizeof(int) );

// fill the arrays 'a' and 'b' on the CPU
for (int i=0; i<N6_1D; i++) {
	host_in1[i] = i;
	host_in2[i] = i;
	host_in3[i] = i;
}

// allocate the memory on the GPU
cudaStatus = cudaMalloc( (void**)&dev_in1, N6_1D * sizeof(int) );
checkCUDAErrors(cudaStatus,"cudaMalloc");
cudaStatus = cudaMalloc( (void**)&dev_out1, N6_1D * sizeof(int) );
checkCUDAErrors(cudaStatus,"cudaMalloc");
cudaStatus = cudaMalloc( (void**)&dev_in2, N6_1D * sizeof(int) );
checkCUDAErrors(cudaStatus,"cudaMalloc");
cudaStatus = cudaMalloc( (void**)&dev_out2, N6_1D * sizeof(int) );
checkCUDAErrors(cudaStatus,"cudaMalloc");
cudaStatus = cudaMalloc( (void**)&dev_in3, N6_1D * sizeof(int) );
checkCUDAErrors(cudaStatus,"cudaMalloc");
cudaStatus = cudaMalloc( (void**)&dev_out3, N6_1D * sizeof(int) );
checkCUDAErrors(cudaStatus,"cudaMalloc");

// copy the arrays to the GPU
cudaStatus = cudaMemcpy( dev_in1, host_in1, N6_1D * sizeof(int), cudaMemcpyHostToDevice );
checkCUDAErrors(cudaStatus,"cudaMemcpyHostToDevice");
cudaStatus = cudaMemcpy( dev_in2, host_in2, N6_1D * sizeof(int), cudaMemcpyHostToDevice );
checkCUDAErrors(cudaStatus,"cudaMemcpyHostToDevice");
cudaStatus = cudaMemcpy( dev_in3, host_in3, N6_1D * sizeof(int), cudaMemcpyHostToDevice );
checkCUDAErrors(cudaStatus,"cudaMemcpyHostToDevice");

//execute the kernel	
//dim3 blockspergrid, threadsperblock;
//threadsperblock.x = DIM;
//blockspergrid.x = (N6_1D + (DIM - 1)) / DIM;
//printf( "blockspergrid %d threadsperblock %d\n", blockspergrid.x, threadsperblock.x);
//add6_1D<<<blockspergrid, threadsperblock>>>( dev_in1, dev_in2, dev_in3, dev_out1, dev_out2, dev_out3, N6_1D );

//execute the kernel	
dim3 blockspergrid, threadsperblock;
threadsperblock.x = DIM_X;				threadsperblock.y = DIM_Y;
blockspergrid.x = (N6_X + (DIM_X - 1)) / DIM_X;		blockspergrid.y = (N6_Y + (DIM_Y - 1)) / DIM_Y;
printf( "blockspergrid %d %d threadsperblock %d %d\n", blockspergrid.x, blockspergrid.y, threadsperblock.x, threadsperblock.y);
add6_2D<<<blockspergrid, threadsperblock>>>( dev_in1, dev_in2, dev_in3, dev_out1, dev_out2, dev_out3, N6_1D );

cudaStatus = cudaThreadSynchronize();
checkCUDAErrors(cudaStatus,"cudaThreadSynchronize");
//cudaStatus = cudaDeviceSynchronize();
//checkCUDAErrors(cudaStatus,"cudaDeviceSynchronize");

// copy the array back from the GPU to the CPU
cudaStatus = cudaMemcpy( host_out1, dev_out1, N6_1D * sizeof(int), cudaMemcpyDeviceToHost );
checkCUDAErrors(cudaStatus,"cudaMemcpyDeviceToHost");
cudaStatus = cudaMemcpy( host_out2, dev_out2, N6_1D * sizeof(int), cudaMemcpyDeviceToHost );
checkCUDAErrors(cudaStatus,"cudaMemcpyDeviceToHost");
cudaStatus = cudaMemcpy( host_out3, dev_out3, N6_1D * sizeof(int), cudaMemcpyDeviceToHost );
checkCUDAErrors(cudaStatus,"cudaMemcpyDeviceToHost");

// free the memory we allocated on the GPU
HANDLE_ERROR( cudaFree( dev_in1 ) );
HANDLE_ERROR( cudaFree( dev_out1 ) );
HANDLE_ERROR( cudaFree( dev_in2 ) );
HANDLE_ERROR( cudaFree( dev_out2 ) );
HANDLE_ERROR( cudaFree( dev_in3 ) );
HANDLE_ERROR( cudaFree( dev_out3 ) );

// free the memory we allocated on the CPU
free( host_in1 );
free( host_out1 );
free( host_in2 );
free( host_out2 );
free( host_in3 );
free( host_out3 );

printf("\nPress any key to continue...\n");
getchar();
getchar();

return 0;

}
*/
// Prints a descriptive failure message (call-site label, numeric code, and
// CUDA's own error string), waits for a keypress, and exits when a CUDA
// runtime call did not return cudaSuccess.
// (The original declaration was split mid-token — "const char" / "str)" —
// and the prompt string contained a Unicode ellipsis; both are repaired.)
void checkCUDAErrors(cudaError_t status, const char *str) {
    if (status != cudaSuccess) {
        printf("\n%s failed (%d): %s", str, status, cudaGetErrorString( status ));
        printf("\nPress any key to continue...");
        getchar();
        getchar();
        exit(-1);
    }
}