working with __shared__ memory

I’m solving the interaction problem for N particles.
I have a particle set V (with a Position property) and a set of constraints E (which describes the interactions between the particles). Each element of E describes the interaction between 2 particles, and each particle can interact with N other particles.
So one element “e” from E refers to 2 elements of V, and each element “v” from V can appear in N elements of E.
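
To make the layout concrete, here is roughly how the lookup table can be built on the host (an illustrative sketch; Mesh and addVertexEdges are made-up names for this post, and the three arrays are what later get bound to the textures):

#include <vector>
#include <vector_types.h>     // int2, float4
#include <vector_functions.h> // make_int2

// CSR-style layout: all edges of a vertex are stored contiguously,
// and vertexToEdge[v] records where they start and how many there are.
struct Mesh
{
	std::vector<int2>   vertexToEdge; // per vertex: (first edge index, edge count)
	std::vector<float>  edgeKoef1;    // one coefficient per edge slot
	std::vector<float4> edgeKoef2;    // one coefficient per edge slot
};

// Append the edges of one vertex and record where they live.
void addVertexEdges(Mesh& m, const std::vector<float>& k1, const std::vector<float4>& k2)
{
	int first = (int)m.edgeKoef1.size();
	m.vertexToEdge.push_back(make_int2(first, (int)k1.size()));
	m.edgeKoef1.insert(m.edgeKoef1.end(), k1.begin(), k1.end());
	m.edgeKoef2.insert(m.edgeKoef2.end(), k2.begin(), k2.end());
}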

I tried the following solution:

texture<int2> texVertexToEdge;
texture<float4> texVertexPosition;
texture<float> texEdgeKoef1;
texture<float4> texEdgeKoef2;

__device__ float4 Handle(int edgeIndex)
{
	float4 force = make_float4(0.0, 0.0, 0.0, 0.0);
	float koef1 = tex1Dfetch(texEdgeKoef1, edgeIndex);
	float4 koef2 = tex1Dfetch(texEdgeKoef2, edgeIndex);
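	// float4 * float (and the += on float4 in the kernel) are not built-in;
	// this assumes operator overloads such as those in helper_math.h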
	force = koef2*koef1;
	return force;
}

__global__
void kernel(float4* offset, int numVertices)
{
	int idx = threadIdx.x + blockIdx.x*blockDim.x;
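	// grid-stride loop: each thread visits every (blockDim.x * gridDim.x)-th vertex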
	while(idx < numVertices)
	{
		//edgeIndex.x - index of the first edge for the current vertex
		//edgeIndex.y - number of edges for the current vertex
		int2 edgeIndex = tex1Dfetch(texVertexToEdge, idx); 
		float4 force = make_float4(0.0, 0.0, 0.0, 0.0);
		for (int index = edgeIndex.x; index < edgeIndex.x + edgeIndex.y; index++)
		{
			force += Handle(index);
		}
		offset[idx] = force;
		idx += blockDim.x * gridDim.x;
	}
}

and somewhere in main:

void main()
{
	.....
	kernel<<<512, 128>>>(offsetVertex, numVertices);
	....
}

Then I decided to calculate the impact on each particle not with a single thread but with a whole block, storing the intermediate results in shared memory:

#define threadsPerBlock 10

__global__ 
void kernel(float4* offset, int numVertices)
{
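	// per-block scratch for the partial forces of one vertex's edges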
	__shared__ float4 cacheForce[threadsPerBlock];
	int idx = blockIdx.x;
	while(idx < numVertices)
	{
		int2 edgeIndex = tex1Dfetch(texVertexToEdge, idx); 
		int cacheIndex = threadIdx.x;
		if (cacheIndex < edgeIndex.y)
		{
			cacheForce[cacheIndex] = Handle(edgeIndex.x + cacheIndex);
		}
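		// wait until every contributing thread has written its partial force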
		__syncthreads();
		if (cacheIndex == 0)
		{
			float4 force = make_float4(0.0, 0.0, 0.0, 0.0);
			for (int i = 0; i < edgeIndex.y; i++)
			{
				force += cacheForce[i];
			}
			offset[idx] = force;
		}
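		// don't let the next iteration overwrite cacheForce before thread 0 has read it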
		__syncthreads();
		idx += gridDim.x;
	}
}

and somewhere in main:

void main()
{
	.....
	kernel<<<512, 128>>>(offsetVertex, numVertices);
	....
}

To my surprise, this implementation is 3 times slower than the previous one. Shared memory is known in the literature as really fast, so the mistake is probably mine, but I can’t see what’s wrong with my code. I adapted this from an example, so I may have made a mistake in how I use shared memory. Can you check?

Are we allowed to use void main() in CUDA?

Also, I’m not sure I understand your summary of the algorithm… E is a set of constraints? Or is it the type of interaction between particles?

Also also, try nvvp or nvprof. They are profilers that can show you where your code is bottlenecked.

a)
kernel<<<512, 128>>>

#define threadsPerBlock 10

int cacheIndex = threadIdx.x;
if (cacheIndex < edgeIndex.y)
{
cacheForce[cacheIndex] = …;
}

You launch 128 threads per block, but cacheForce has only threadsPerBlock = 10 slots. If edgeIndex.y is ever greater than 10, the threads with threadIdx.x between 10 and edgeIndex.y write past the end of the shared array; if it never is, then 118 of your 128 threads do nothing at all. Which block size did you actually mean?
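
For what it's worth, if the intent really is one block per vertex, the shared array has to match the block size and the per-edge loop has to tolerate edgeIndex.y larger than the block. A rough sketch of what I'd expect (assuming you keep the 128-thread launch and the same float4 operator overloads your code already relies on):

__global__
void kernel(float4* offset, int numVertices)
{
	__shared__ float4 cacheForce[128]; // must equal the threads-per-block of the launch

	for (int idx = blockIdx.x; idx < numVertices; idx += gridDim.x)
	{
		int2 edgeIndex = tex1Dfetch(texVertexToEdge, idx);

		// each thread sums a strided subset of this vertex's edges,
		// so edgeIndex.y may exceed the block size
		float4 sum = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
		for (int e = threadIdx.x; e < edgeIndex.y; e += blockDim.x)
			sum += Handle(edgeIndex.x + e);
		cacheForce[threadIdx.x] = sum;
		__syncthreads();

		if (threadIdx.x == 0)
		{
			float4 force = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
			for (int i = 0; i < blockDim.x; i++)
				force += cacheForce[i];
			offset[idx] = force;
		}
		__syncthreads(); // protect cacheForce from the next iteration
	}
}

Even so, a block per vertex only pays off when each vertex has many edges; if vertices really have about 10 edges (your threadsPerBlock of 10 suggests so), the first kernel keeps far more threads busy.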

b)

int idx = threadIdx.x + blockIdx.x*blockDim.x;
while(idx < numVertices)
offset[idx] = force;

vs

int idx = blockIdx.x;
while(idx < numVertices)
offset[idx] = force;

In the first kernel every thread owns its own vertex and idx strides by blockDim.x * gridDim.x; in the second a whole block owns one vertex, idx strides by gridDim.x, and a single thread per block does the final summation. That's a completely different parallelization. Is it intentional?

you have successfully confused me