Hi all,
I have some experience in C++ programming, and I have read several online tutorials on CUDA and GPU programming, but they are still pretty abstract to me. The most difficult part for me is memory management. I am using a Tesla C2075 card, which is said to have 6 GB of video memory, but it seems that this does not mean the GPU can actually use all 6 GB!?
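For instance, is cudaMemGetInfo the right way to see how much of the 6 GB is actually usable? Something like:

size_t freeBytes = 0, totalBytes = 0;
cudaMemGetInfo(&freeBytes, &totalBytes);   // free vs. total device memory, in bytes
printf("free: %zu, total: %zu\n", freeBytes, totalBytes);

Anyway, what I understand about the memory so far is as follows: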
- shared memory is memory shared by all threads in the same block; on my device there is at most 48 KB of it per block. So does that mean each block has 48 KB of memory that can be used by all the threads defined inside that block? I wrote an example:
#define SDSIZE 512
typedef cufftDoubleComplex Complex;   // as in the full listing further down

__global__ void mykernel(void)
{
    __shared__ int n[SDSIZE];
    __shared__ double m[SDSIZE];
    __shared__ Complex foo[SDSIZE];
    for (int i = 0; i < SDSIZE; i++)   // note: every thread executes this full loop
    {
        n[i] = i;
        m[i] = (double)i;
        foo[i].x = foo[i].y = (double)i;
    }
}
In the code above, if I use the __shared__ keyword inside the kernel function to define a variable, will that variable be shared by all threads? So if I have 10x10 blocks and each block has 80x20 threads, will each block (80x20 threads) initialize n, m and foo once? And will any modification of n be visible to all the other threads in the same block?
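From the tutorials, my impression is that the more usual pattern is to let each thread initialize only part of a shared array and then synchronize, roughly like this (a sketch assuming a 1-D block; the strided loop is my own guess at the idiom):

__global__ void initShared(void)
{
    __shared__ int n[SDSIZE];
    __shared__ double m[SDSIZE];
    // each thread fills only the elements it "owns", strided by the block size
    for (int i = threadIdx.x; i < SDSIZE; i += blockDim.x)
    {
        n[i] = i;
        m[i] = (double)i;
    }
    __syncthreads();   // the whole block waits here until the arrays are fully initialized
    // ... every thread in the block can now safely read n and m ...
}

Is that right, or is my redundant version above also correct, just wasteful?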
- I find that shared memory is not much, just 48 KB, but in my case each thread needs more than 192 KB to store its intermediate results, so shared memory will not work. What memory should I use instead? (I describe my problem, and my guess at a solution, below.)
What I am trying to calculate is the following. I am solving an equation with initial parameters (x0, y0). I need to solve the same equation for 800 different values of x0 and 200 different values of y0. My idea for using CUDA is to split the (x0, y0) pairs across a 10x10 grid of blocks, each block with 80x20 threads. Each thread will be responsible for solving the equation for one specific (x0, y0), determined by its block index and thread index.
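Concretely, I think each thread would recover its (x0, y0) pair from the built-in indices roughly like this (a sketch; solveAll, x0s and y0s are placeholder names for device arrays holding the 800 and 200 parameter values):

__global__ void solveAll(const double *x0s, const double *y0s)
{
    int ix = blockIdx.x * blockDim.x + threadIdx.x;   // 0 .. 10*80-1 = 799
    int iy = blockIdx.y * blockDim.y + threadIdx.y;   // 0 .. 10*20-1 = 199
    double x0 = x0s[ix];
    double y0 = y0s[iy];
    // ... solve the equation for this (x0, y0) ...
}

(One thing I have not verified yet is whether 80x20 = 1600 threads per block is even allowed; it may exceed the maxThreadsPerBlock value that my InitGPUSet below prints.)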
Among all threads, the variables n, m and foo are shared. But each thread also needs four more independent variables, vec1, vec2, vec3 and vec4. I tried to define them at device scope, but that does not compile. So my question is: how do I assign enough memory to each thread for vec1 through vec4 so that the threads do not share them?
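My current guess is to allocate one big buffer from the host with cudaMalloc, pass the pointer to the kernel, and let each thread compute its own private slice. Would something like this be the right direction? (A sketch; d_scratch, tid and the slice layout are my own invention, not from any tutorial.)

// host side: one allocation that holds vec1..vec4 for every thread
Complex *d_scratch;
size_t perThread = 4 * (size_t)VECTORSIZE;        // room for vec1..vec4
size_t nThreads  = 10 * 10 * 80 * 20;             // grid x block, as planned above
cudaMalloc((void**)&d_scratch, nThreads * perThread * sizeof(Complex));

// device side: each thread carves out its private slice
__global__ void mykernel(Complex *scratch)
{
    int tid = (blockIdx.y * gridDim.x + blockIdx.x) * (blockDim.x * blockDim.y)
            + threadIdx.y * blockDim.x + threadIdx.x;   // unique thread id
    Complex *vec1 = scratch + (size_t)tid * 4 * VECTORSIZE;
    Complex *vec2 = vec1 + VECTORSIZE;
    Complex *vec3 = vec2 + VECTORSIZE;
    Complex *vec4 = vec3 + VECTORSIZE;
    // ... calculations using vec1..vec4 ...
}

Though I realize that 160,000 threads times roughly 256 KB each is about 40 GB, far more than the 6 GB on the card, so presumably I would have to process the grid in chunks anyway. For completeness, here is my full test code that does not compile: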
#include <cuda.h>
#include <iostream>
#include "cuPrintf.cu"
#include <cublas_v2.h>
#include <cufft.h>
typedef cufftDoubleComplex Complex;
#define SDSIZE 512
#define VECTORSIZE 4096
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                              \
do {                                                                      \
    /* Check synchronous errors, i.e. pre-launch */                       \
    cudaError_t err = cudaGetLastError();                                 \
    if (cudaSuccess != err) {                                             \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",     \
                __FILE__, __LINE__, cudaGetErrorString(err));             \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */             \
    err = cudaDeviceSynchronize(); /* cudaThreadSynchronize() is deprecated */ \
    if (cudaSuccess != err) {                                             \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",     \
                __FILE__, __LINE__, cudaGetErrorString(err));             \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
} while (0)
int InitGPUSet()
{
    cudaDeviceProp tCard;
    int num = 0;
    if (cudaSuccess != cudaGetDeviceCount(&num)) return 0;
    for (int i = 0; i < num; ++i)
    {
        cudaSetDevice(i);
        if (i == 0)
        {
            cudaGetDeviceProperties(&tCard, i);
            printf("GPU: %s\n", tCard.name);
            printf("Maximum threads per block: %d\n", tCard.maxThreadsPerBlock);
            printf("Maximum dimension (1,2,3) of block: %d %d %d\n",
                   tCard.maxThreadsDim[0], tCard.maxThreadsDim[1], tCard.maxThreadsDim[2]);
            printf("Maximum dimension (1,2,3) of grid: %d %d %d\n",
                   tCard.maxGridSize[0], tCard.maxGridSize[1], tCard.maxGridSize[2]);
            printf("can map host memory: %d\n", tCard.canMapHostMemory);
            printf("total global memory: %zu\n", tCard.totalGlobalMem);        // size_t, so %zu
            printf("shared memory per block: %zu\n", tCard.sharedMemPerBlock); // size_t, so %zu
            printf("Total registers per block: %d\n", tCard.regsPerBlock);
        }
    }
    return 1;
}
__global__ void mykernel(void)
{
    __shared__ int n[SDSIZE];
    __shared__ double m[SDSIZE];
    __shared__ Complex foo[SDSIZE];
    __device__ Complex *vec1, *vec2, *vec3, *vec4;   // <- this is the line the compiler rejects
    for (int i = 0; i < SDSIZE; i++)                 // bounded by SDSIZE to match the arrays
    {
        n[i] = i;
        m[i] = (double)i;
        foo[i].x = foo[i].y = (double)i;
    }
    // these calls also fail to compile: cudaMalloc/cudaFree are host-side API
    cudaMalloc(vec1, VECTORSIZE*sizeof(Complex));
    cudaMalloc(vec2, VECTORSIZE*sizeof(Complex));
    cudaMalloc(vec3, VECTORSIZE*sizeof(Complex));
    cudaMalloc(vec4, VECTORSIZE*sizeof(Complex));
    // all calculations here
    cudaFree(vec1);
    cudaFree(vec2);
    cudaFree(vec3);
    cudaFree(vec4);
}
int main(void)
{
    if (!InitGPUSet()) puts("device is not ready!");
    else
    {
        CHECK_LAUNCH_ERROR();   // clear any earlier errors
        mykernel<<<dim3(10, 10), dim3(80, 20)>>>();
        CHECK_LAUNCH_ERROR();   // catch launch/execution errors from the kernel
    }
    return 0;
}