Got it, thanks very much. But it seems difficult to minimize data movement in my scene. I’ll explain the whole process as follows if you are interested.

My model was divided into two kernel functions.

1> The first kernel function generates 10^8 particles with attributes (a structure) like:

```
// Per-particle state carried between the generation and transport kernels.
// This is the per-thread working copy (array-of-structures form); the global
// queue stores the same fields in structure-of-arrays form (see ParticleDeque).
struct ParticleInfo
{
float energy;         // particle energy; values <= 0.001f are treated as invalid
float weight;         // statistical weight; values <= 0.001f are treated as invalid
float velocityX;      // velocity / direction components
float velocityY;
float velocityZ;
float positionX;      // position in world coordinates
float positionY;
float positionZ;
int voxelPositionX;   // position in voxel-grid coordinates
int voxelPositionY;
int voxelPositionZ;
int particleType;
int absVoxelPosition; // presumably the flattened 1-D voxel index — TODO confirm
float laminMfp;       // mean-free-path-related quantity, presumably — confirm semantics
float prob;           // interaction probability, presumably — confirm
float voxelDensity;   // density of the current voxel
int voxelMaterial;    // material id of the current voxel
int evtType;          // event type; 0 means "no valid event" (see generateParticle)
// ~20 variables ......
};
```

Because the total particle number is very large, I have to split the 10^8 particles into 1526 batches of 65536. The kernel function is invoked as:

```
// A call site cannot contain parameter type declarations (that is C++
// declaration syntax, not a call) — pass the device queue pointer itself.
generateParticle<<<iNumBlock, iNumThreadPerBlock>>>(pParticleQueue, 1);
```

where iNumBlock*iNumThreadPerBlock = 65536, which has proved to be efficient in this application.

In some cases 65536 particles will be put into the global memory pointed to by pParticleInfo. HOWEVER, some threads will not generate a valid particle, so in most cases only about 50000 particles will be generated (the count varies randomly). In order to avoid leaving some threads idle in the next transport kernel function, I invoke the above kernel function in the following way:

```
// Launch generation batches until the queue holds a full batch of 65536
// particles or the 1526-invocation budget is spent.
int nMaxParticle = 0;
static int nInvokeTime = 0;
while (true) {
    // Kernel signature is generateParticle(ParticleDeque*, int): pass the
    // device pointer itself, not *pParticleQueue.
    generateParticle<<<iNumBlock, iNumThreadPerBlock>>>(pParticleQueue, 1);
    // NOTE: the blocking cudaMemcpy below already synchronizes with the
    // preceding kernel; this explicit sync is kept for clarity only.
    cudaDeviceSynchronize();
    ++nInvokeTime;
    cudaMemcpy(&nMaxParticle, &(pParticleQueue->queueSize), sizeof(int),
               cudaMemcpyDeviceToHost);
    // Bug fix: the original only exited while nMaxParticle < 65536; once the
    // queue filled up it spun forever, wrapping the circular queue and
    // overwriting live particles. Exit on either a full batch or an
    // exhausted invocation budget.
    if (nMaxParticle >= 65536 || nInvokeTime >= 1526)
        break;
}
```

In this manner, the global memory “pParticleInfo” is not accessed efficiently, because I treat it as a continuous circular deque into which a thread writes only when it has generated a valid particle.

```
// Queue capacity. Must remain a power of two: consumers wrap indices with
// `index & (MAX_EVENT_SIZE - 1)` and the producer wraps via atomicInc.
// Bug fix: the body is parenthesized so the macro survives expansion inside
// arbitrary expressions — the original `65536*2` would expand e.g.
// `x % MAX_EVENT_SIZE` into `x % 65536 * 2`, which is `(x % 65536) * 2`.
#define MAX_EVENT_SIZE (65536*2)
// Structure-of-arrays circular particle queue held in global memory: each
// attribute is a contiguous array, so threads writing consecutive slots
// touch consecutive addresses.
struct ParticleDeque
{
float energy[MAX_EVENT_SIZE];
float weight[MAX_EVENT_SIZE];
// the same data to ParticleInfo, abbreviated
unsigned int queueIndex;          // next write slot; wraps at MAX_EVENT_SIZE
int queueSize;                    // number of valid entries currently queued
bool queueDirty[MAX_EVENT_SIZE];  // per-slot occupancy flag
};
// Appends one particle to the circular queue: reserves a slot with atomicInc
// (which wraps to 0 once the old value reaches MAX_EVENT_SIZE - 1), copies the
// payload in, marks the slot dirty, then bumps queueSize. Always returns true.
// NOTE(review): the payload store and the queueDirty/queueSize updates are not
// fenced, so a concurrent consumer could observe queueSize incremented before
// the payload is globally visible — confirm that producer and consumer kernels
// never run concurrently (the host loops above serialize them).
__device__ bool SaveTransportInfoToQueue(ParticleDeque &particleDeque, ParticleInfo const &particleInfo)
{
unsigned int index = atomicInc(&(particleDeque.queueIndex), MAX_EVENT_SIZE - 1);
// Bug fix: ParticleDeque has no member `particleEnergy`; the field is `energy`.
particleDeque.energy[index] = particleInfo.energy;
// code to copy info from deviceParticleInfo to deviceParticleQueue, abbreviated
particleDeque.queueDirty[index] = true;
atomicAdd(&(particleDeque.queueSize), 1);
return true;
}
// Generates at most one particle per thread and pushes valid ones onto the
// shared circular queue. Expects a 1-D launch of gridDim.x * blockDim.x
// threads (65536 in this application).
__global__ void generateParticle(ParticleDeque *pParticleQueue, int nDequeDim)
{
// Global thread id, consumed by the abbreviated generation code below.
const int iThreadID = threadIdx.x + blockDim.x * blockIdx.x;
// ParticleInfo may not fit in registers and can spill to local memory,
// since the struct is ~20 * 4 = 80 bytes.
ParticleInfo particleInfo;
// Bug fix: the original wrote sizeof(ParitcleInfo) — a misspelled type name
// that would not compile.
memset(&particleInfo, 0, sizeof(ParticleInfo));
// code generate particle info, abbreviated
int evtType = 0;
// code to judge evtType, abbreviated
if(0 != evtType){
// Discard invalid particles: negligible energy or weight.
if(particleInfo.energy > 0.001f && particleInfo.weight > 0.001f) {
SaveTransportInfoToQueue(pParticleQueue[0], particleInfo);
}
}
return;
}
```

2> The second kernel function will get 65536 particles from the above circular queue by invoking:

```
// Alternates generation and transport: refill the queue until a full batch of
// 65536 particles is available, then launch transport to consume it; give up
// refilling after 1526 invocations.
int nMaxParticle = 0;
static int nInvokeTime = 0;
while(true) {
// Bug fix: the original wrote `generateParticle<<<...>>>(ParticleInfo *pParticleInfo, 1)`,
// which is declaration syntax, not a call — pass the queue pointer as at the
// first call site.
generateParticle<<<iNumBlock, iNumThreadPerBlock>>>(pParticleQueue, 1);
cudaDeviceSynchronize();
++nInvokeTime;
cudaMemcpy(&nMaxParticle , &(pParticleQueue->queueSize), sizeof(int), cudaMemcpyDeviceToHost);
if(nMaxParticle < 65536) {
if(nInvokeTime < 1526)
continue;  // queue not full yet: generate another batch
else
break;     // invocation budget exhausted: stop with a partial batch
}
// A full batch of 65536 particles is ready: consume it.
transport<<<iNumBlock, iNumThreadPerBlock>>>(pParticleQueue, 1);
cudaDeviceSynchronize();
}
```

```
// Loads one queued particle (slot `index`) from the SoA queue into the
// caller's per-thread AoS ParticleInfo.
// Bug fix: the original parameter types were `particleQueue` (the same
// spelling as the parameter name — not a type) and `QMCGPUPartcleTransportInfo`
// (not declared anywhere in this listing); the caller in
// GetTransportInfoFromQueue passes a ParticleQueue and a ParticleInfo.
__device__ void CopyQueueToTransportInfo(ParticleQueue &particleQueue, ParticleInfo &particleInfo, int index)
{
particleInfo.energy = particleQueue.energy[index];
// copy data from particleQueue to particleInfo, abbreviated
}
// Pops blockDim.x particles (one per thread) off the tail of the shared
// circular queue and copies each into the caller's per-thread ParticleInfo.
// One elected thread per block does the global bookkeeping; every thread then
// computes its own slot. Always returns true.
__device__ bool GetTransportInfoFromQueue(ParticleQueue &particleQueue, ParticleInfo &particleInfo)
{
// initialize
// NOTE(review): shared variables start uninitialized, so the `if (0 != ...)`
// tests below read indeterminate values, and multiple threads may store to
// nRequestNum/nLock concurrently before the barrier — run compute-sanitizer
// racecheck/initcheck to confirm this settles as intended.
__shared__ int nRequestNum;
__shared__ int nLock;
__shared__ unsigned int nThreadIDQueue[THREAD_NUM];
if(0 != nRequestNum) {
nRequestNum = 0;
}
if(1 != nLock) {
nLock = 1;
}
nThreadIDQueue[threadIdx.x] = -1;
__syncthreads();
// get number required
// Each thread claims a distinct rank in [0, blockDim.x) for this block's request.
nThreadIDQueue[threadIdx.x] = atomicAdd(&nRequestNum, 1);
__syncthreads();
// update global memory
__shared__ unsigned int nQueueIndex;
// Exactly one thread wins the exchange (nLock: 1 -> 0) and updates the global
// queue bookkeeping on behalf of the whole block.
if(0 != atomicExch(&nLock, 0)) {
atomicSub(&particleQueue.queueSize, nRequestNum);
// Clamp queueSize at 0 in case more slots were requested than were available.
atomicMax(&particleQueue.queueSize, 0);
// NOTE(review): the ParticleDeque listing above names this field `queueIndex`;
// `m_iQueueIndex` (and `m_bQueueDirty` below) look like an older naming
// convention — confirm which spelling is current.
nQueueIndex = atomicSub(&particleQueue.m_iQueueIndex, nRequestNum);
}
__syncthreads();
// update local memory
// Each thread takes a distinct slot just below the block's reserved head;
// when the whole block participated, threadIdx.x equals the claimed rank.
unsigned int index;
if(nRequestNum == blockDim.x) {
index = nQueueIndex - threadIdx.x - 1;
}
else {
index = nQueueIndex - nThreadIDQueue[threadIdx.x] - 1;
}
// Wrap into the circular buffer; requires MAX_EVENT_SIZE to be a power of two.
index = index & (MAX_EVENT_SIZE - 1);
CopyQueueToTransportInfo(particleQueue, particleInfo, index);
// Release the slot: clear the occupancy flag and scrub the energy.
particleQueue.m_bQueueDirty[index] = false;
//particleQueue.m_iEvtType[index] = -2;
particleQueue.energy[index] = 0.f;
return true;
}
// One thread transports one particle: each thread pulls a particle from the
// shared queue and then runs its transport independently.
__global__ void transport(DeviceParticleQueue* pDeviceParticleQueue, int nDim)
{
// Global thread id, consumed by the abbreviated transport code below.
const int iThreadID = threadIdx.x + blockDim.x * blockIdx.x;
ParticleInfo pDeviceParticleInfo;
// Bug fix: GetTransportInfoFromQueue takes the queue by reference; the
// original passed the raw pointer, which does not compile.
if(!GetTransportInfoFromQueue(*pDeviceParticleQueue, pDeviceParticleInfo)) {
return;
}
// Each thread does transport individually, abbreviated
return;
}
```