 # Help going in the right direction with CUDA: a simple but slow C code to optimise

Hi,

I am a newbie with CUDA (I've installed it and written my first simple program today) and I would like to port one of my codes to CUDA.
However, after reading tons of information, I am a bit unsure about what is really possible with it …

Here is my problem

I have a list of data (Donnees, up to millions of points) with three coordinates x,y,and z (Donnees[i].x, Donnees[i].y, Donnees[i].z)

I want to calculate, on a cubic matrix (i,j,k), the sum of cos(f_kx*Donnees[i].x + f_ky*Donnees[i].y + f_kz*Donnees[i].z), where f_kx depends on i, f_ky depends on j, and f_kz depends on k.

The maximum size of the matrix is 256×256×256.

Since the data is big, what can I do to optimise the C code for CUDA and what are the limits possible in term of threads, grid, and blocks and dimensions of all these strange concepts for me.
Here my C code.

/*
 * CPU reference: triple loop over the (i,j,k) reciprocal-space grid;
 * the innermost loop accumulates cos() over all atoms.
 * Cost is O(Nx * Ny * Nz * iNbAtom1) — this is the part to offload to the GPU.
 * (Operators below restored; the forum rendering had eaten the '*' characters.)
 */
for (i = 0; i <= i_decx; i++)
{
    f_kx = 2 * PI * (f_kxmin + i * f_stepkx);

    for (j = 0; j <= i_decy; j++)
    {
        f_ky = 2 * PI * (f_kymin + j * f_stepky);

        for (k = 0; k <= i_decz; k++)
        {
            f_kz = 2 * PI * (f_kzmin + k * f_stepkz);

            for (i2 = 0; i2 < iNbAtom1; i2++)
            {
                /* NOTE(review): 2*PI appears both in f_kx/f_ky/f_kz and again
                   inside cos() here, unlike the GPU kernel below which applies
                   it once — confirm which convention is intended. */
                f_k = f_kx * Donnees[i2].fPx + f_ky * Donnees[i2].fPy + f_kz * Donnees[i2].fPz;
                fre[i][j][k] = fre[i][j][k] + cos(2 * PI * f_k);
            }
        }
    }
}

I don’t need the CUDA code but just want to evaluate what I should do or not do …

What is the size(order) of iNbAtom1?
The size of the 3D matrix is always 256×256×256?

The size of iNbAtom1 can vary, as can the size of the 3D matrix; the maximum is about 10,000,000 atoms (that is really a maximum — usually 100,000–1,000,000) and the 3D matrix is around 100×100×100.

I’ve tried this code for CUDA, using the size along x for the thread index, and a 2D grid of blocks for the y and z coordinates. With a relatively small test file (2000 atoms, 50×50×50) it works well and gives a fast result, but it crashed the system with 128×128×128.

// Kernel that executes on the CUDA device.
// Launch layout: blockDim.x = sizem.x threads (thread index = i, the x cell);
// 2-D grid where blockIdx.x = j (y cell) and blockIdx.y = k (z cell).
// b must be zeroed before the first launch — the result is added into it.
__global__ void fourier(float3 *a, float2 *b, int iNbAtom1,
                        float3 f_stepk, float3 f_kmin, float3 f_kmax,
                        int3 sizem)
{
    int i = threadIdx.x;   // x cell (this declaration was missing in the draft)
    int j = blockIdx.x;    // y cell
    int k = blockIdx.y;    // z cell

    // fixed: the draft compared j against sizem.x and k against sizem.y
    if (i < sizem.x && j < sizem.y && k < sizem.z)
    {
        float3 f_k;
        // fixed: the draft used f_kmin.x for all three axes
        f_k.x = 2.0f * PI * (f_kmin.x + i * f_stepk.x);
        f_k.y = 2.0f * PI * (f_kmin.y + j * f_stepk.y);
        f_k.z = 2.0f * PI * (f_kmin.z + k * f_stepk.z);

        // Accumulate in registers and touch global memory once at the end,
        // instead of read-modify-writing b[] for every atom.
        float re = 0.0f;
        float im = 0.0f;
        for (int i2 = 0; i2 < iNbAtom1; i2++)
        {
            float f = f_k.x * a[i2].x + f_k.y * a[i2].y + f_k.z * a[i2].z;
            re += cosf(f);
            im += sinf(f);
        }

        int idx = i + j * sizem.x + k * sizem.y * sizem.x;
        b[idx].x += re;
        b[idx].y += im;
    }
}

here the host

float3 *a_d;   // device copy of the atom coordinates
float2 *b_d;   // device output grid (real, imaginary parts)

size_t sizea = iNbAtom1 * sizeof(float3);
// fixed: the '*' operators between the three extents were missing
size_t sizeb = (size_t)(i_dec.x + 1) * (i_dec.y + 1) * (i_dec.z + 1) * sizeof(float2);

cudaMalloc((void **) &a_d, sizea);   // Allocate atom array on device
cudaMalloc((void **) &b_d, sizeb);   // Allocate output grid on device

// Initialize host array and copy it to CUDA device.
cudaMemcpy(a_d, Donnees, sizea, cudaMemcpyHostToDevice);
// fixed: the kernel accumulates with '+=', so the output must start at zero
cudaMemset(b_d, 0, sizeb);

int3 sizem;
sizem.x = (i_dec.x + 1);
sizem.y = (i_dec.y + 1);
sizem.z = (i_dec.z + 1);

// One thread per x cell, one block per (y,z) cell.
// NOTE(review): sizem.x must not exceed the device's max threads per block
// (512 on this generation) — confirm for large grids.
dim3 n_blocks(i_dec.y + 1, i_dec.z + 1);

// Do calculation on device:
fourier <<< n_blocks, sizem.x >>> (a_d, b_d, iNbAtom1, f_stepk, f_kmin, f_kmax, sizem);

// Retrieve result from device and store it in host array
// (cudaMemcpy blocks until the kernel has finished).
cudaMemcpy(a_h, b_d, sizeb, cudaMemcpyDeviceToHost);

I would make one kernel to compute the value for each cell in the matrix (i,j,k), then store it back to a 1-D vector of size (dimx×dimy×dimz) in device memory, then run the sum-reduction kernel on it (code is available in the SDK project ‘reduction’).

This way, both aspects of your calculation are optimized.

Yes that looks to be a good idea…

by modifying the code of reduction I can calculate the cos for each atom. After I should use a loop to do it on each node of my 3D matrix.

But I am a little bit confused about how to use the reduction code.

/// something like that ??

// Per-(i,j,k) reduction kernel, adapted from the SDK 'reduction' sample.
// Each thread accumulates cos(k.r) over a grid-strided slice of the atoms,
// then the block tree-reduces its partials in shared memory.  One partial
// sum per block is written to g_odata[blockIdx.x]; the host sums those.
// Shared memory required: blockSize * sizeof(float) (3rd launch argument).
template <unsigned int blockSize>
__global__ void foufou(float3 *g_idata, float *g_odata, float3 k_x, unsigned int n)
{
    extern __shared__ float sdata[];

    unsigned int tid = threadIdx.x;   // this declaration was missing in the draft
    unsigned int i = blockIdx.x * (blockSize * 2) + tid;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;

    // Each iteration reads two elements per thread; guard the second read so
    // a size that is not a multiple of 2*blockSize is not read out of bounds.
    while (i < n) {
        sdata[tid] += cosf(k_x.x * g_idata[i].x + k_x.y * g_idata[i].y + k_x.z * g_idata[i].z);
        if (i + blockSize < n)
            sdata[tid] += cosf(k_x.x * g_idata[i + blockSize].x + k_x.y * g_idata[i + blockSize].y + k_x.z * g_idata[i + blockSize].z);
        i += gridSize;
    }
    __syncthreads();   // fixed: barrier was missing before the tree reduction

    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } __syncthreads(); }

    if (tid < 32) {
        // Warp-synchronous tail: volatile forces re-reads from shared memory
        // (classic pre-Volta idiom, matching the SDK sample).
        volatile float *smem = sdata;
        if (blockSize >= 64) smem[tid] += smem[tid + 32];
        if (blockSize >= 32) smem[tid] += smem[tid + 16];
        if (blockSize >= 16) smem[tid] += smem[tid +  8];
        if (blockSize >=  8) smem[tid] += smem[tid +  4];
        if (blockSize >=  4) smem[tid] += smem[tid +  2];
        if (blockSize >=  2) smem[tid] += smem[tid +  1];
    }

    // fixed: the draft stored 'sdata' (the pointer) instead of the reduced value
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

The problem is how it is possible to choose the blockSize considering the list of atoms, and where can I find the result (in g_odata, or do I need to use a loop to calculate it?)

I tried this solution … I finally found how to manage the blocksize … but
When I used it , nothing is calculated at the end ( I obtain a 0 value )
here is my code
I don’t know why, if some one has a solution …

here the code

// Reduction kernel (second draft): sums cos(f_k . r) over the n atoms in
// g_idata.  One partial result per block is written to g_odata[blockIdx.x];
// the host adds the per-block partials.
// Shared memory required: blockSize * sizeof(float) (3rd launch argument).
template <unsigned int blockSize>
__global__ void foufou(float3 *g_idata, float *g_odata, float3 f_k, unsigned int n)
{
    extern __shared__ float sdata[];

    unsigned int tid = threadIdx.x;   // this declaration was missing in the draft
    unsigned int i = blockIdx.x * (blockSize * 2) + tid;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;

    // fixed: use cosf (single precision); cos() forced double-precision math.
    // The second read is guarded so sizes not a multiple of 2*blockSize do
    // not read past the end of the atom array.
    while (i < n) {
        sdata[tid] += cosf(f_k.x * g_idata[i].x + f_k.y * g_idata[i].y + f_k.z * g_idata[i].z);
        if (i + blockSize < n)
            sdata[tid] += cosf(f_k.x * g_idata[i + blockSize].x + f_k.y * g_idata[i + blockSize].y + f_k.z * g_idata[i + blockSize].z);
        i += gridSize;
    }
    __syncthreads();   // fixed: barrier was missing before the tree reduction

    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } __syncthreads(); }

    if (tid < 32) {
        // Warp-synchronous tail (pre-Volta idiom from the SDK sample).
        volatile float *smem = sdata;
        if (blockSize >= 64) smem[tid] += smem[tid + 32];
        if (blockSize >= 32) smem[tid] += smem[tid + 16];
        if (blockSize >= 16) smem[tid] += smem[tid +  8];
        if (blockSize >=  8) smem[tid] += smem[tid +  4];
        if (blockSize >=  4) smem[tid] += smem[tid +  2];
        if (blockSize >=  2) smem[tid] += smem[tid +  1];
    }

    // resultat dans g_odata, un resultat par block
    // fixed: store sdata[0], not the pointer 'sdata'
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

and in the host

int threads = 512;
// fixed: round up, so the last partial block of atoms is not dropped and
// small inputs (iNbAtom1 < threads) do not produce a zero-block launch
int blocks = (iNbAtom1 + threads - 1) / threads;

float3 *g_idata;   // device copy of the atom coordinates
float  *g_odata;   // one partial sum per block

dim3 dimGrid(blocks, 1, 1);
dim3 dimBlock(threads, 1, 1);   // fixed: dimBlock was used below but never declared

size_t sizea = iNbAtom1 * sizeof(float3);
size_t sizeb = blocks * sizeof(float);

cudaMalloc((void **) &g_idata, sizea);   // Allocate array on device
cudaMalloc((void **) &g_odata, sizeb);   // Allocate array on device

// Initialize host array and copy it to CUDA device
cudaMemcpy(g_idata, Donnees, sizea, cudaMemcpyHostToDevice);

// Dynamic shared memory: one float per thread for the block reduction.
int smemSize = threads * sizeof(float);

int l;
float value;   // fixed: was int, but it accumulates floats and is printed with %f

for (i = 0; i <= i_dec.x; i++)
for (j = 0; j <= i_dec.y; j++)
for (k = 0; k <= i_dec.z; k++)
{
    f_k.x = 2 * PI * (f_kmin.x + f_stepk.x * i);
    f_k.y = 2 * PI * (f_kmin.y + f_stepk.y * j);
    f_k.z = 2 * PI * (f_kmin.z + f_stepk.z * k);

    foufou<512><<< dimGrid, dimBlock, smemSize >>>(g_idata, g_odata, f_k, iNbAtom1);
    // Blocking copy: also synchronizes with the kernel just launched.
    cudaMemcpy(a, g_odata, sizeb, cudaMemcpyDeviceToHost);

    // Sum the per-block partials on the host.
    value = 0.0f;
    for (l = 0; l < blocks; l++)
    {
        value += a[l];   // fixed: 'value += value + a[l]' doubled the running sum
    }
    printf("fkx %f ,fky %f ,fkz%f i %d j %d k %d l %d value %f\n", f_k.x, f_k.y, f_k.z, i, j, k, l, value);
}

cudaFree(g_idata);
cudaFree(g_odata);

Ok it was due to me … problem fixed

I forgot to change a declaration…

Finally, I obtain about a 10× improvement with my FX1700 compared to optimised code on a Core2Duo …