Help going in the right direction with CUDA: a simple but slow C code to optimise

Hi,

I am a newbie with CUDA (I installed it and wrote my first simple program today) and I would like to port one of my codes to CUDA.
However, after reading tons of information, I am a bit unsure about what is really possible with it…

Here is my problem

I have a list of data (Donnees, up to millions of points) with three coordinates x,y,and z (Donnees[i].x, Donnees[i].y, Donnees[i].z)

I want to calculate, on a cubic matrix (i,j,k), the sum over the data points of cos(f_kx*Donnees[n].x + f_ky*Donnees[n].y + f_kz*Donnees[n].z), where f_kx depends on i, f_ky depends on j, and f_kz depends on k.
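In other words (writing n for the data index so it does not clash with the matrix index i), the quantity to compute for every cell (i,j,k) is roughly:

fre[i][j][k] = sum over n of cos( f_kx(i)*x_n + f_ky(j)*y_n + f_kz(k)*z_n ),
with f_kx(i) = 2*PI*(f_kxmin + i*f_stepkx), and similarly for f_ky(j) and f_kz(k).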

The maximum size of the matrix is 256 × 256 × 256.

Since the data set is big, what can I do to adapt and optimise the C code for CUDA, and what are the possible limits in terms of threads, blocks, grids, and the dimensions of all these concepts that are still strange to me?
Here is my C code.

for (i = 0; i <= i_decx; i++)
{
    f_kx = 2*PI*(f_kxmin + i*f_stepkx);

    for (j = 0; j <= i_decy; j++)
    {
        f_ky = 2*PI*(f_kymin + j*f_stepky);

        for (k = 0; k <= i_decz; k++)
        {
            f_kz = 2*PI*(f_kzmin + k*f_stepkz);

            /* sum over all atoms for this (i,j,k) cell */
            for (i2 = 0; i2 < iNbAtom1; i2++)
            {
                f_k = f_kx*Donnees[i2].fPx + f_ky*Donnees[i2].fPy + f_kz*Donnees[i2].fPz;
                fre[i][j][k] += cos(f_k);   /* 2*PI is already included in f_kx, f_ky, f_kz */
            }
        }
    }
}

I don’t need the full CUDA code, I just want to evaluate what I should or should not do…

If anyone can answer me, thanks in advance

What is the size (order) of iNbAtom1?
Is the size of the 3D matrix always 256 × 256 × 256?

iNbAtom1 can vary, as can the size of the 3D matrix; the maximum is about 10,000,000 atoms (that is really a maximum, usually 100,000–1,000,000) and the 3D matrix is around 100 × 100 × 100.
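For a rough idea of the amount of work (a back-of-the-envelope estimate using the typical sizes above): the inner cosine is evaluated once per (cell, atom) pair, so

total evaluations ≈ nx*ny*nz * iNbAtom1 ≈ 100*100*100 * 1,000,000 = 10^12

which is why the plain C version is so slow.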

I’ve tried the following CUDA code, using the size along x for the thread index and a 2D grid of blocks for the y and z coordinates. With a relatively small test file (2000 atoms, 50 × 50 × 50) it works well and gives a fast result, but it crashed the system with 128 × 128 × 128.

// Kernel that executes on the CUDA device

__global__ void fourier(float3 *a, float2 *b, int iNbAtom1, float3 f_stepk, float3 f_kmin, float3 f_kmax, int3 sizem)
{
    float3 f_k;
    float  f;
    int    i2;

    int i = threadIdx.x;   // x index of the cell
    int j = blockIdx.x;    // y index of the cell
    int k = blockIdx.y;    // z index of the cell

    int is = sizem.x;
    int js = sizem.y;
    int ks = sizem.z;

    if (i < is && j < js && k < ks)
    {
        f_k.x = 2*PI*(f_kmin.x + i*f_stepk.x);
        f_k.y = 2*PI*(f_kmin.y + j*f_stepk.y);
        f_k.z = 2*PI*(f_kmin.z + k*f_stepk.z);

        // sum over all atoms for this cell
        for (i2 = 0; i2 < iNbAtom1; i2++)
        {
            f = f_k.x*a[i2].x + f_k.y*a[i2].y + f_k.z*a[i2].z;
            b[i + j*sizem.x + k*sizem.y*sizem.x].x += cosf(f);
            b[i + j*sizem.x + k*sizem.y*sizem.x].y += sinf(f);
        }
    }
}

Here is the host code:

float3 *a_d;
float2 *b_d;

size_t sizea = iNbAtom1 * sizeof(float3);
size_t sizeb = (i_dec.x+1) * (i_dec.y+1) * (i_dec.z+1) * sizeof(float2);

cudaMalloc((void **) &a_d, sizea);   // Allocate input array on device
cudaMalloc((void **) &b_d, sizeb);   // Allocate output array on device
cudaMemset(b_d, 0, sizeb);           // Zero the output, since the kernel accumulates with +=

// Copy the host array to the CUDA device
cudaMemcpy(a_d, Donnees, sizea, cudaMemcpyHostToDevice);

int3 sizem;
sizem.x = (i_dec.x+1);
sizem.y = (i_dec.y+1);
sizem.z = (i_dec.z+1);

dim3 n_blocks(i_dec.y+1, i_dec.z+1);

// Do calculation on device:
fourier <<< n_blocks, sizem.x >>> (a_d, b_d, iNbAtom1, f_stepk, f_kmin, f_kmax, sizem);

// Retrieve result from device and store it in host array
cudaMemcpy(a_h, b_d, sizeb, cudaMemcpyDeviceToHost);

I would make one kernel to compute the value for each cell of the matrix (i,j,k), then store it back to a 1-D vector of size (dimx*dimy*dimz) in device memory, then run the sum-reduction kernel on it (code is available in the SDK project ‘reduction’).

This way, both aspects of your calculation are optimized.
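(A minimal sketch of the flattened indexing this suggests — the names nx, ny, nz and cell are placeholders, not from the code above:)

// flatten cell (i,j,k) into a 1-D device vector of size nx*ny*nz
int idx = i + j*nx + k*nx*ny;
cell[idx] = value_for_this_cell;   // each thread writes its own cell; a reduction can then run over the vector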

Yes that looks to be a good idea…

By modifying the reduction code I can calculate the cos for each atom. Then I should use a loop to do it on each node of my 3D matrix.

But I am a little bit confused about how to use the reduction code.

/// something like that ??

template <unsigned int blockSize>
__global__ void foufou(float3 *g_idata, float *g_odata, float3 k_x, unsigned int n)
{
    extern __shared__ float sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockSize*2) + tid;
    unsigned int gridSize = blockSize*2*gridDim.x;

    sdata[tid] = 0;

    // each thread accumulates cos() terms for several atoms, striding through the data
    while (i < n) {
        sdata[tid] += cosf(k_x.x*g_idata[i].x + k_x.y*g_idata[i].y + k_x.z*g_idata[i].z);
        if (i + blockSize < n)
            sdata[tid] += cosf(k_x.x*g_idata[i+blockSize].x + k_x.y*g_idata[i+blockSize].y + k_x.z*g_idata[i+blockSize].z);
        i += gridSize;
    }
    __syncthreads();

    // tree reduction in shared memory, as in the SDK 'reduction' sample
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } __syncthreads(); }

    if (tid < 32) {
        if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
        if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
        if (blockSize >= 16) sdata[tid] += sdata[tid +  8];
        if (blockSize >=  8) sdata[tid] += sdata[tid +  4];
        if (blockSize >=  4) sdata[tid] += sdata[tid +  2];
        if (blockSize >=  2) sdata[tid] += sdata[tid +  1];
    }

    // one partial sum per block
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

The problem is how to choose blockSize given the number of atoms, and where I can find the result (is it in g_odata[0], or do I need a loop to compute it?).
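(My understanding of the SDK reduction, for what it is worth: g_odata holds one partial sum per block, so the result is not simply g_odata[0]; the host still has to combine the per-block values, or the kernel can be run a second time on g_odata. A minimal sketch, where h_partial is a placeholder name for the host copy of g_odata:)

// combine the per-block partial sums on the host
float total = 0.0f;
for (int l = 0; l < blocks; l++)
    total += h_partial[l];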

I tried this solution… I finally found how to manage the blockSize… but when I use it, nothing is calculated at the end (I obtain a 0 value).
I don’t know why; if someone has a solution…

Here is the code:

template <unsigned int blockSize>
__global__ void foufou(float3 *g_idata, float *g_odata, float3 f_k, unsigned int n)
{
    extern __shared__ float sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockSize*2) + tid;
    unsigned int gridSize = blockSize*2*gridDim.x;
    sdata[tid] = 0;

    while (i < n) {
        sdata[tid] += cos(f_k.x*g_idata[i].x + f_k.y*g_idata[i].y + f_k.z*g_idata[i].z);
        if (i + blockSize < n)
            sdata[tid] += cos(f_k.x*g_idata[i+blockSize].x + f_k.y*g_idata[i+blockSize].y + f_k.z*g_idata[i+blockSize].z);
        i += gridSize;
    }
    __syncthreads();

    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } __syncthreads(); }
    if (tid < 32) {
        if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
        if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
        if (blockSize >= 16) sdata[tid] += sdata[tid +  8];
        if (blockSize >=  8) sdata[tid] += sdata[tid +  4];
        if (blockSize >=  4) sdata[tid] += sdata[tid +  2];
        if (blockSize >=  2) sdata[tid] += sdata[tid +  1];
    }
    // result in g_odata, one partial sum per block
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

And in the host:

    int threads = 512;
    int blocks  = iNbAtom1 / threads;

    float3 *g_idata;
    float  *g_odata;

    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);

    size_t sizea = iNbAtom1 * sizeof(float3);
    size_t sizeb = blocks * sizeof(float);

    cudaMalloc((void **) &g_idata, sizea);   // Allocate input array on device
    cudaMalloc((void **) &g_odata, sizeb);   // Allocate per-block partial sums on device

    // Copy the host array to the CUDA device
    cudaMemcpy(g_idata, Donnees, sizea, cudaMemcpyHostToDevice);

    int smemSize = threads * sizeof(float);

    i = 0;
    j = 0;
    k = 0;

    int value, l;

    // one reduction-kernel launch per cell (i,j,k) of the 3D matrix
    for (i = 0; i <= i_dec.x; i++)
    for (j = 0; j <= i_dec.y; j++)
    for (k = 0; k <= i_dec.z; k++)
    {
        f_k.x = 2 * PI * (f_kmin.x + f_stepk.x*i);
        f_k.y = 2 * PI * (f_kmin.y + f_stepk.y*j);
        f_k.z = 2 * PI * (f_kmin.z + f_stepk.z*k);

        foufou<512><<< dimGrid, dimBlock, smemSize >>>(g_idata, g_odata, f_k, iNbAtom1);
        cudaMemcpy(a, g_odata, sizeb, cudaMemcpyDeviceToHost);

        value = 0;
        for (l = 0; l < blocks; l++)
        {
            value += value + a[l];
        }
        printf("fkx %f, fky %f, fkz %f, i %d j %d k %d l %d value %f\n", f_k.x, f_k.y, f_k.z, i, j, k, l, value);
    }

    cudaFree(g_idata);
    cudaFree(g_odata);

OK, it was due to me… problem fixed.

I forgot to change a declaration…

Finally, I obtain roughly a 10× improvement with my FX1700 compared to optimised code on a Core2Duo…
Not bad.

I get another 3-5% increase if I use registers in the first while loop of the reduction kernel and put them back to shared memory after the while loop… Not huge but not a whole lot of work either, eh?
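(For later readers, a minimal sketch of what that change might look like — an interpretation, not the exact code:)

// accumulate in a register inside the strided loop,
// then write to shared memory once, before the tree reduction
float sum = 0.0f;
while (i < n) {
    sum += cos(f_k.x*g_idata[i].x + f_k.y*g_idata[i].y + f_k.z*g_idata[i].z);
    if (i + blockSize < n)
        sum += cos(f_k.x*g_idata[i+blockSize].x + f_k.y*g_idata[i+blockSize].y + f_k.z*g_idata[i+blockSize].z);
    i += gridSize;
}
sdata[tid] = sum;     // back to shared memory for the reduction stages
__syncthreads();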