In cuda-samples/Samples/2_Concepts_and_Techniques/particles at master · NVIDIA/cuda-samples · GitHub, it seems that its texture memory feature is removed (but in the PDF documentation, there is part discussing performance increase via texture memory).

I found another repository, which seems to be more older version which utilizes texture memory feature.

(cuda-sample/particles_kernel_impl.cuh at master · zchee/cuda-sample · GitHub)

I wonder if this removal of texture memory in the recent version is for some reasonable reason? Maybe recent GPU products are already good enough that small non-coalesced memory access is non of particular issue?

My program makes neighbor list of each particles, and in the **global** kernel, I have to access some quantities of that neighbor particle and this is un-avoidably non-coalesced (exactly as the /particles PDF documents says).

To be more specific, I limit the maximum number of neighbors as MAX_NEIGHBOR, and if there are N particles, then I have a Neighbors[N*MAX_NEIGHBOR]. And also I carry another array Quantity[N] that carries some information of the particles.

Then I can access the lists of neighbors in coalesced manner, but cannot come up with to efficient method to access Quantity.

```
int index = blockIdx.x;
if (index > N) return;
int neighborIndex = Neighbors[MAX_NEIGHBOR*index + threadIdx.x]; // coalesced
double quantity = Quantity[neighborIndex]; // cannot be coalesced
```

My neighbor list is generated at the very first time step, and it is about the bonds between the nearby particles. So this neighbor list does not change during the simulation (while the positions of particles could be very deformed).

Therefore, I expect that somehow I should be able to optimize this kind of operation via texture memory or some special sorting but as a one-month newcomer to CUDA I cannot get it optimized. And I have no idea on how I can use texture memory on this purpose. Compared to more basic components of CUDA, informations and online posts on the texture memory seems quite limited and most of existing seems to related to graphics…

Any suggestions, keywords, or samples will be highly appreciated.

The following is the kernels that I am using now. Here I need to access Pos, and Vol of the neighbor particles. Also Vel is read, but it has no role so far. I have put it there just to prepare next implementations of more sophisticated models.

```
__global__ void calcBondAccelD(double4 *DVel_Dt, // output
double4 *oldPos, // inputs : oldPos[numParticles]
double4 *oldVel, // oldVel[numParticles]
double *Rho, // Rho[numParticles]
double *Vol, // Vol[numParticles]
double *BondStretch, // BondStretch[numParticles*MAX_NEIGHBOR]
double *Mu, // Mu[numParticles*MAX_NEIGHBOR]
int *Type, // Type[numParticles]
int *Neighbors, // Neighbors[numParticles*MAX_NEIGHBOR]
int *numNeighbors, // numNeighbors[numParticles]
int numParticles) {
int sortedIndex = blockIdx.x * blockDim.x + threadIdx.x; // one particle per thread
if (sortedIndex >= numParticles) return;
if ( (Type[sortedIndex] != 0) && (Type[sortedIndex] != 1) ) return; // bonds only for main particles
double3 pos = make_double3(oldPos[sortedIndex]);
double3 vel = make_double3(oldVel[sortedIndex]);
double3 force = make_double3(0.0);
// examine neighbor list
for (int i = 0; i<numNeighbors[sortedIndex]; i++) {
int neighborIndex = Neighbors[MAX_NEIGHBOR * sortedIndex + i];
//if (Type[neighborIndex] != -1) { // SJLee: should I remove lost particle here?
double3 pos2 = make_double3(oldPos[neighborIndex]);
double3 relPos = pos2 - pos;
double dist = length(relPos);
double _delta = params.horizon;
double _c = 18.*params.modBulk / (CUDART_PI * _delta*_delta*_delta*_delta);
force += _c * BondStretch[sortedIndex*MAX_NEIGHBOR + i] * Mu[sortedIndex*MAX_NEIGHBOR + i]
* Vol[neighborIndex] * relPos / dist;
//}
}
// write new velocity back to original unsorted location
DVel_Dt[sortedIndex] = make_double4(force / Rho[sortedIndex], 0.0);
}
```

Obviously this is very naive implementation, so I am trying some variants of this but this is even slower than the previous naive implementation sadly.

```
__global__ void calcBondAccelD2(double4 *DVel_Dt, // output
double4 *oldPos, // intputs
double4 *oldVel,
double *Rho,
double *Vol,
double *BondStretch,
double *Mu,
int *Type,
int *Neighbors,
int *numNeighbors,
int numParticles) {
int sortedIndex = blockIdx.x; // one particle per one block, threads taking care of its neighbors
__shared__ double _s[256][6];
if (sortedIndex >= numParticles) return;
if ( (Type[sortedIndex] != 0) && (Type[sortedIndex] != 1) ) return; // bonds only for main particles
double3 pos = make_double3(oldPos[sortedIndex]);
double3 vel = make_double3(oldVel[sortedIndex]);
double rho = Rho[sortedIndex];
double3 force = make_double3(0.0);
int neighborIndex = Neighbors[MAX_NEIGHBOR * sortedIndex + threadIdx.x];
int _numNeighbor = numNeighbors[sortedIndex];
double3 pos2 = make_double3(oldPos[neighborIndex]);
double vol2 = Vol[neighborIndex];
_s[threadIdx.x][0] = BondStretch[sortedIndex*256 + threadIdx.x];
_s[threadIdx.x][1] = Mu [sortedIndex*256 + threadIdx.x];
_s[threadIdx.x][2] = 0.0;
_s[threadIdx.x][3] = 0.0;
_s[threadIdx.x][4] = 0.0;
__syncthreads();
if (threadIdx.x < _numNeighbor) {
//if (Type[neighborIndex] != -1) { // SJLee: should I remove lost particle here?
double3 relPos = pos2 - pos;
double dist = length(relPos);
double _delta = params.horizon;
double _c = 18.*params.modBulk / (CUDART_PI * _delta*_delta*_delta*_delta);
force = _c * _s[threadIdx.x][0] * _s[threadIdx.x][1] * vol2 * relPos / dist;
_s[threadIdx.x][2] = force.x;
_s[threadIdx.x][3] = force.y;
_s[threadIdx.x][4] = force.z;
//}
}
__syncthreads();
int threadIdx2;
// parallel reduction - WARNING!! numThread should be fixed to 256 !!!
if (threadIdx.x < 128) {
threadIdx2 = threadIdx.x + 128;
if (threadIdx2 < _numNeighbor) {
_s[threadIdx.x][2] = _s[threadIdx.x][2] + _s[threadIdx2][2];
_s[threadIdx.x][3] = _s[threadIdx.x][3] + _s[threadIdx2][3];
_s[threadIdx.x][4] = _s[threadIdx.x][4] + _s[threadIdx2][4];
}
__syncthreads();
}
if (threadIdx.x < 64) {
threadIdx2 = threadIdx.x + 64;
if (threadIdx2 < _numNeighbor) {
_s[threadIdx.x][2] = _s[threadIdx.x][2] + _s[threadIdx2][2];
_s[threadIdx.x][3] = _s[threadIdx.x][3] + _s[threadIdx2][3];
_s[threadIdx.x][4] = _s[threadIdx.x][4] + _s[threadIdx2][4];
}
__syncthreads();
}
if (threadIdx.x < 32) {
threadIdx2 = threadIdx.x + 32;
if (threadIdx2 < _numNeighbor) {
_s[threadIdx.x][2] = _s[threadIdx.x][2] + _s[threadIdx2][2];
_s[threadIdx.x][3] = _s[threadIdx.x][3] + _s[threadIdx2][3];
_s[threadIdx.x][4] = _s[threadIdx.x][4] + _s[threadIdx2][4];
}
__syncthreads();
}
force.x = _s[0][2];
force.y = _s[0][3];
force.z = _s[0][4];
// write new velocity back to original unsorted location
DVel_Dt[sortedIndex] = make_double4(force / rho, 0.0);
}
```