I’m solving the interaction problem of N particles.
I have a particle set V (with property Position) and a set of constraints E (that describes the interaction between the particles). The each position from E is for the interaction between 2 particles and the each particle can interacte with N other particles.
So one position “e” from E contains 2 positions from V. And any position “v” from V contains N positions from E.
I try the following solution:
texture<int2> texVertexToEdge;
texture<float4> texVertexPosition;
texture<float> texEdgeKoef1;
texture<float4> texEdgeKoef2;
__device__ float4 Handle(int edgeIndex)
{
float4 force = make_float4(0.0, 0.0, 0.0, 0.0);
float koef1 = tex1Dfetch(texEdgeKoef1, edgeIndex);
force4 koef2 = tex1Dfetch(texEdgeKoef2, edgeIndex);
force = koef2*koef1;
return force;
}
__global__
void kernel(float4* offset, int numVertices)
{
int idx = threadIdx.x + blockIdx.x*blockDim.x;
while(idx < numVertices)
{
//edgeIndex.x - first edge index for current vertex
//edgeIndex.y - edge element's num for current vertex
int2 edgeIndex = tex1Dfetch(texVertexToEdge, idx);
float4 force = make_float4(0.0, 0.0, 0.0, 0.0);
for (int index = edgeIndex.x; index < edgeIndex.x + edgeIndex.y; index++)
{
force += Handle(index);
}
offset[idx] = force;
idx += blockDim.x * gridDim.x
}
}
some in main
void main()
{
.....
kernel<<<512, 128>>>(offsetVertex, numVertices);
....
}
Then I decided to calculate the impact on the particle not by using the single thread but with whole block. And store the intermediate results in shared memory:
#define threadsPerBlock 10
__global__
void kernel(float* offset, int numVertices)
{
__shared__ float4 cacheForce[threadsPerBlock];
int idx = blockIdx.x;
while(idx < numVertices)
{
int2 edgeIndex = tex1Dfetch(texVertexToEdge, idx);
int cacheIndex = threadIdx.x;
if (cacheIndex < edgeIndex.y)
{
cacheForce[cacheIndex] = Handle(edgeIndex.x + cacheIndex);
}
__syncthreads();
if (cacheIndex == 0)
{
float4 force = make_float4(0.0, 0.0, 0.0, 0.0);
for (int i = 0; i < edgeIndex.y; i++)
{
force += cacheForce[i];
}
offset[idx] = force;
}
__syncthreads();
idx += gridDim.x;
}
}
some in main
void main()
{
.....
kernel<<<512, 128>>>(offsetVertex, threadsPerBlock);
....
}
To my surprise this implementation is 3 times slower than previous. shared is known as really fast memory in literature. So that’s might be my mistake. But I don’t know what’s the problem with my code. I rewrite the original example and I could make a mistake with shared using. Can you check?