How to set the variables in the global memory to zero effectively? initialize global memory

How to set the variables in the global memory to zero effectively?

I am modifying my finite element program to use CUDA.

The main time-consuming part of the program is the computation of the internal force at each node.

In cpu code, there are mainly the following steps

  1. set FORCE to zero

  2. loop on each element, and compute the force on each node of current element

  3. sum the element forces together into the nodal force array

Following this approach, I wrote the program below.

[codebox]

#include <stdio.h>

// Launch configuration: one block of 10 threads (single-block case).
#define BLOCK_NUM 1
#define THREAD_NUM 10

// Mesh size: 4 hexahedral elements (8 nodes each), 18 nodes total.
#define NUM_ELE 4
#define NUM_NOD 18

// 1-based accessors kept from the original (Fortran-style) CPU code:
//   NC(j,N)  -> j-th node (1..8) of element N (1..NUM_ELE)
//   FORCE(N) -> force accumulator of node N (1..NUM_NOD)
#define NC(j,N) *(NC+(N-1)*8+j-1)
#define FORCE(N) *(FORCE+N-1)

// NOTE(fix): the forum software stripped the underscores from "__global__";
// restored here so the prototype compiles.
__global__ void internal_kernel(int* , float *);

int main(void)
{
    // Host-side element connectivity (8 nodes per element) and the nodal
    // force array that receives the device results.
    int   NC[8*NUM_ELE];
    float FORCE[NUM_NOD];

    // Connectivity of the 2x2 element patch (1-based node numbers).
    NC(1,1)=1 ; NC(2,1)=3 ; NC(3,1)=9 ; NC(4,1)=7 ;
    NC(5,1)=2 ; NC(6,1)=4 ; NC(7,1)=10; NC(8,1)=8 ;

    NC(1,2)=3 ; NC(2,2)=5 ; NC(3,2)=11; NC(4,2)=9 ;
    NC(5,2)=4 ; NC(6,2)=6 ; NC(7,2)=12; NC(8,2)=10;

    NC(1,3)=7 ; NC(2,3)=9 ; NC(3,3)=15; NC(4,3)=13;
    NC(5,3)=8 ; NC(6,3)=10; NC(7,3)=16; NC(8,3)=14;

    NC(1,4)=9 ; NC(2,4)=11; NC(3,4)=17; NC(4,4)=15;
    NC(5,4)=10; NC(6,4)=12; NC(7,4)=18; NC(8,4)=16;

    int   *dNC    = NULL;
    float *dFORCE = NULL;
    cudaMalloc((void**)&dNC,    NUM_ELE*8*sizeof(int));
    cudaMalloc((void**)&dFORCE, NUM_NOD*sizeof(float));

    // NOTE(fix): this copy was split mid-identifier by the forum software
    // ("cudaMemcpyHostToDevi / ce"); rejoined so it compiles.
    cudaMemcpy(dNC, NC, NUM_ELE*8*sizeof(int), cudaMemcpyHostToDevice);

    // First call.  The kernel zeroes dFORCE itself, which is only safe for a
    // single block (see kernel comments); with BLOCK_NUM == 1 this is OK.
    internal_kernel<<<BLOCK_NUM,THREAD_NUM>>>(dNC, dFORCE);

    // Kernel launches return no status directly: check cudaGetLastError()
    // for configuration errors; the blocking memcpy below also surfaces any
    // asynchronous execution error.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));

    cudaMemcpy(FORCE, dFORCE, NUM_NOD*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 1; i <= NUM_NOD; i++)
        printf("NOD %d %f\n", i, FORCE(i));

    // Second call: verifies the kernel re-initializes the accumulator
    // correctly when reused.
    internal_kernel<<<BLOCK_NUM,THREAD_NUM>>>(dNC, dFORCE);
    err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));

    cudaMemcpy(FORCE, dFORCE, NUM_NOD*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 1; i <= NUM_NOD; i++)
        printf("NOD %d %f\n", i, FORCE(i));

    cudaFree(dNC);
    cudaFree(dFORCE);
    return 0;
}

// Computes internal nodal forces by a scatter-add over the elements.
//
// WARNING: the zeroing loop below is ordered before the accumulation loop
// only by __syncthreads(), which is a *block-level* barrier.  With more than
// one block, one block may start accumulating before another has finished
// zeroing -- zero FORCE with cudaMemset or a separate kernel in that case.
//
// NOTE(fix): restored "__global__" (underscores stripped by the forum),
// float-suffixed the literals, and replaced the plain read-modify-write on
// FORCE(NOD) with atomicAdd: elements share nodes, so concurrent threads can
// update the same entry and plain "+=" loses updates.
__global__ void internal_kernel(int* NC, float *FORCE)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int THREADS = blockDim.x;
    const int BLOCKS  = gridDim.x;

    float FORCE_ELE[8];   // per-thread element force workspace

    // Grid-stride zeroing of the global force accumulator.
    for (int i = bid*THREADS + tid; i < NUM_NOD; i += BLOCKS*THREADS)
        *(FORCE+i) = 0.0f;
    __syncthreads();      // synchronizes this block only!

    // One element per thread, grid-stride over elements (1-based).
    for (int NELE = bid*THREADS + tid + 1; NELE <= NUM_ELE; NELE += BLOCKS*THREADS)
    {
        // Placeholder element force: NELE*10 + local node number.
        for (int j = 1; j <= 8; j++)
            FORCE_ELE[j-1] = NELE*10.0f + j*1.0f;

        // Scatter-add into the nodal array; atomicAdd serializes the
        // conflicting updates on shared nodes.
        for (int j = 1; j <= 8; j++)
        {
            int NOD = NC(j,NELE);
            atomicAdd(&FORCE(NOD), FORCE_ELE[j-1]);
        }
    }
}

[/codebox]

As the FORCE array in global memory should be initialized to zero, I want to do this on the GPU side rather than the CPU side to improve efficiency.

(If initialized on CPU side, lots of zeros have to be copied from host to device.)

If I use many threads but only one block to perform the computation, as the __syncthreads() can be used within a block, the results are correct.

But when more blocks are used, since the execution order of the blocks is unpredictable and __syncthreads() cannot synchronize across blocks, the above leads to wrong results.

Anyone can give me some advice to solve this problem?

Thank you in advance.

I have thought a method.

use a separate init_force_kernel to deal with this.

There is an another question .

Since unstructured meshes are used, how can I improve the efficiency of the program?

It seems that using texture fetch to access the NC array may be one of the means.

Any other means can be used?

Thank you in advance.

[codebox]

#include <stdio.h>

// Two blocks of one thread each: exercises the multi-block case that breaks
// in-kernel initialization (no inter-block barrier exists inside a kernel).
#define BLOCK_NUM 2
#define THREAD_NUM 1

// Mesh size: 4 hexahedral elements (8 nodes each), 18 nodes total.
#define NUM_ELE 4
#define NUM_NOD 18

// 1-based accessors kept from the original CPU code.
#define NC(j,N) *(NC+(N-1)*8+j-1)
#define FORCE(N) *(FORCE+N-1)

// NOTE(fix): restored the "__global__" underscores stripped by the forum.
__global__ void init_force_kernel(float *);
__global__ void internal_kernel(int* , float *);

int main(void)
{
    // Host-side element connectivity and the nodal force result array.
    int   NC[8*NUM_ELE];
    float FORCE[NUM_NOD];

    // Connectivity of the 2x2 element patch (1-based node numbers).
    NC(1,1)=1 ; NC(2,1)=3 ; NC(3,1)=9 ; NC(4,1)=7 ;
    NC(5,1)=2 ; NC(6,1)=4 ; NC(7,1)=10; NC(8,1)=8 ;

    NC(1,2)=3 ; NC(2,2)=5 ; NC(3,2)=11; NC(4,2)=9 ;
    NC(5,2)=4 ; NC(6,2)=6 ; NC(7,2)=12; NC(8,2)=10;

    NC(1,3)=7 ; NC(2,3)=9 ; NC(3,3)=15; NC(4,3)=13;
    NC(5,3)=8 ; NC(6,3)=10; NC(7,3)=16; NC(8,3)=14;

    NC(1,4)=9 ; NC(2,4)=11; NC(3,4)=17; NC(4,4)=15;
    NC(5,4)=10; NC(6,4)=12; NC(7,4)=18; NC(8,4)=16;

    int   *dNC    = NULL;
    float *dFORCE = NULL;
    cudaMalloc((void**)&dNC,    NUM_ELE*8*sizeof(int));
    cudaMalloc((void**)&dFORCE, NUM_NOD*sizeof(float));

    // NOTE(fix): rejoined the cudaMemcpy call that the forum software split
    // across two lines ("cudaMemcpyHostToDevi / ce").
    cudaMemcpy(dNC, NC, NUM_ELE*8*sizeof(int), cudaMemcpyHostToDevice);

    // Zero the accumulator in a separate launch: kernels issued to the same
    // stream execute in order, so the zeroing is globally complete before
    // internal_kernel starts -- safe for any number of blocks.
    // (cudaMemset(dFORCE, 0, NUM_NOD*sizeof(float)) works equally well.)
    init_force_kernel<<<BLOCK_NUM,THREAD_NUM>>>(dFORCE);
    internal_kernel<<<BLOCK_NUM,THREAD_NUM>>>(dNC, dFORCE);

    // Kernel launches return no status directly: check cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));

    cudaMemcpy(FORCE, dFORCE, NUM_NOD*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 1; i <= NUM_NOD; i++)
        printf("NOD %d %f\n", i, FORCE(i));

    cudaFree(dNC);
    cudaFree(dFORCE);
    return 0;
}

// Zeroes the global force accumulator with a grid-stride loop.
// Running this as its own launch (instead of zeroing inside the compute
// kernel) gives a global ordering: same-stream kernels serialize, so all of
// FORCE is zero before the accumulation kernel begins, for any grid size.
// NOTE(fix): restored "__global__" and used a float literal (0.0f).
__global__ void init_force_kernel(float *FORCE)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int THREADS = blockDim.x;
    const int BLOCKS  = gridDim.x;

    for (int i = bid*THREADS + tid; i < NUM_NOD; i += BLOCKS*THREADS)
        *(FORCE+i) = 0.0f;
}

// Accumulates internal nodal forces: one element per thread (grid-stride),
// scatter-adding the 8 per-node element forces into the global FORCE array.
// Precondition: FORCE zeroed before launch (init_force_kernel or cudaMemset).
// NOTE(fix): restored "__global__", float-suffixed the literals, and used
// atomicAdd -- neighbouring elements share nodes, so different threads and
// blocks can update the same FORCE entry concurrently, and a plain
// read-modify-write there loses updates.
__global__ void internal_kernel(int* NC, float *FORCE)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int THREADS = blockDim.x;
    const int BLOCKS  = gridDim.x;

    float FORCE_ELE[8];   // per-thread element force workspace

    for (int NELE = bid*THREADS + tid + 1; NELE <= NUM_ELE; NELE += BLOCKS*THREADS)
    {
        // Placeholder element force: NELE*10 + local node number.
        for (int j = 1; j <= 8; j++)
            FORCE_ELE[j-1] = NELE*10.0f + j*1.0f;

        // atomicAdd serializes conflicting updates on shared nodes.
        for (int j = 1; j <= 8; j++)
        {
            int NOD = NC(j,NELE);
            atomicAdd(&FORCE(NOD), FORCE_ELE[j-1]);
        }
    }
}

[/codebox]

Just use cudaMemset

Thank you for your good advice.

Can I ask one more question?

Is there problem when multi blocks access the same global memory position at the same time?

As the program on the above ,

FORCE(NOD)=FORCE(NOD)+FORCE_ELE[j-1];

Thank you very much.

[codebox]

// Accumulates internal nodal forces by scatter-add over the elements.
// Precondition: FORCE zeroed before launch.
// NOTE(fix): this is the version quoted to ask about concurrent writes.
// The answer: "FORCE(NOD) = FORCE(NOD) + x" from multiple threads/blocks on
// a shared node is a read-modify-write race with undefined results; the fix
// is atomicAdd, which serializes the conflicting updates.  Also restored
// "__global__" and float-suffixed the literals.
__global__ void internal_kernel(int* NC, float *FORCE)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int THREADS = blockDim.x;
    const int BLOCKS  = gridDim.x;

    float FORCE_ELE[8];   // per-thread element force workspace

    for (int NELE = bid*THREADS + tid + 1; NELE <= NUM_ELE; NELE += BLOCKS*THREADS)
    {
        // Placeholder element force: NELE*10 + local node number.
        for (int j = 1; j <= 8; j++)
            FORCE_ELE[j-1] = NELE*10.0f + j*1.0f;

        // Race-free accumulation on nodes shared by several elements.
        for (int j = 1; j <= 8; j++)
        {
            int NOD = NC(j,NELE);
            atomicAdd(&FORCE(NOD), FORCE_ELE[j-1]);
        }
    }
}

[/codebox]

Reading from the same location in multiple threads/blocks does not cause problems. Writing to the same location from multiple threads/blocks results in undefined behavior. I think the only guarantee you have is that at least one will succeed. Also, if you are writing to global memory from one thread and then reading from that same location in another, you will need something like the new threadfence() functions to ensure you don’t have read after write issues.

It seems that I should consider another method to summarize the force but not directly as the above to avoid the writing conflict.

Thank you for your kind reply.