Hi everybody!

I’m having problems with shared memory. I’m trying to multiply many small square matrices. In general here is the forumla for computing ONLY ONE result:

X1 = F(L0 * X0 * R0 + B0)

X2 = F(L1 * X1 * R1 + B1)

…

Result = F(Ln * Xn * Rn + Bn)

Every ‘L’, ‘X’, ‘R’, ‘B’ matrix dimensions are equal, square and have one of the following sizes: 4x4, 8x8, 16x16, 32x32.

F - is a non linear function that is applied to each element of a matrix

As you can see the computation of ‘Result’ is sequential, but there are many Results to compute.

I see this problem solved by putting as many matrices as possible to shared memory and to multiply them there.

I have come out with the following source:

[codebox]#include <stdio.h>

#include <cutil_inline.h>

#define StateRows 16

#define StateCols 16

#define LayersCount 4

#define SpiciesCount 256

#define SharedMemSize (16384)

#define SharedMemFloats (SharedMemSize / sizeof(float))

#define ThreadsPerBlock 512

#define SpiciesPerBlock (SharedMemSize / (2 * StateRows * StateCols * sizeof(float)))

#define Cast(Object, Type) ((Type)(Object))

typedef char Byte;

typedef struct

{

```
float s[SpiciesCount][StateRows][StateCols];
float l[LayersCount][SpiciesCount][StateRows][StateCols];
float r[LayersCount][SpiciesCount][StateRows][StateCols];
float b[LayersCount][SpiciesCount][StateRows][StateCols];
```

} Spicies;

typedef struct

{

```
float a[SpiciesPerBlock][StateRows][StateCols];
float b[SpiciesPerBlock][StateRows][StateCols];
```

} Multiplication;

**global** void VecAdd(void* globals)

{

```
__shared__ Byte SharedMem[SharedMemSize];
unsigned int localIndex = threadIdx.x;
unsigned int globalIndex = blockIdx.x * (sizeof(float[SpiciesPerBlock][StateRows][StateCols]) / sizeof(float));
while (localIndex < (sizeof(float[SpiciesPerBlock][StateRows][StateCols]) / sizeof(float)))
{
Cast(Cast(SharedMem, Multiplication*)->a, float *)[localIndex] = Cast(Cast(globals, Spicies*)->s[globalIndex], float *)[localIndex];
Cast(Cast(SharedMem, Multiplication*)->b, float *)[localIndex] = Cast(Cast(globals, Spicies*)->l[globalIndex], float *)[localIndex];
localIndex += blockDim.x;
}
__syncthreads();
// for debug purposes
```

int i = blockDim.x * blockIdx.x + threadIdx.x;

```
if (i == 0)
{
for (i = 0; i < SharedMemFloats; i++)
{
Cast(globals, float *)[i] = Cast(SharedMem, float *)[i];
}
}
```

}

int main(int argc, char** argv)

{

```
float* h_data;
float* d_data;
```

int n = 32768;

```
size_t size = n * sizeof(float);
```

h_data = (float*)malloc(size);

// for (int i = 0; i < n; i++) h_data[i] = i;

cutilSafeCall(cudaMalloc((void**)&d_data, size));

cutilSafeCall(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

int threadsPerBlock = 512;

```
int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_data);
```

cutilSafeCall(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));

```
for (int i = 0; i < n; i++)
{
printf("%f ", h_data[i]);
}
if (h_data) free(h_data);
if (d_data) cudaFree(d_data);
```

cutilSafeCall(cudaThreadExit());

}

[/codebox]

which says “ptxas error : Entry function ‘_Z6VecAddPv’ uses too much shared data (0x4004 bytes + 0x10 bytes system, 0x4000 max)” it seems like there is some reserved data is being added to shared memory, how to get away with that?