Hi everybody!
I’m having problems with shared memory. I’m trying to multiply many small square matrices. In general, here is the formula for computing ONLY ONE result:
X1 = F(L0 * X0 * R0 + B0)
X2 = F(L1 * X1 * R1 + B1)
…
Result = F(Ln * Xn * Rn + Bn)
All of the ‘L’, ‘X’, ‘R’, ‘B’ matrices are square, have the same dimensions, and are one of the following sizes: 4x4, 8x8, 16x16, 32x32.
F is a nonlinear function that is applied to each element of a matrix.
As you can see, the computation of each ‘Result’ is sequential, but there are many Results to compute.
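To pin the math down, here is a plain host-side reference of what one ‘Result’ chain computes (just a sketch: the flat row-major layout, the N/layers parameters and tanhf standing in for F are placeholders I picked for illustration, not my real code):
[codebox]#include <stdlib.h>
#include <string.h>
#include <math.h>

// Reference for one chain: X <- F(L[k] * X * R[k] + B[k]) for k = 0..layers-1.
// L, R, B each hold 'layers' matrices of size N x N, stored row-major.
// X holds X0 on entry and the final Result on exit.
void result_chain_ref(int N, int layers,
                      const float* L, const float* R, const float* B,
                      float* X)
{
    float* tmp = (float*)malloc(N * N * sizeof(float));
    for (int k = 0; k < layers; k++)
    {
        const float* Lk = L + k * N * N;
        const float* Rk = R + k * N * N;
        const float* Bk = B + k * N * N;
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
            {
                // (Lk * X * Rk)[i][j] = sum over p,q of Lk[i][p] * X[p][q] * Rk[q][j]
                float acc = 0.0f;
                for (int p = 0; p < N; p++)
                    for (int q = 0; q < N; q++)
                        acc += Lk[i * N + p] * X[p * N + q] * Rk[q * N + j];
                tmp[i * N + j] = tanhf(acc + Bk[i * N + j]); // tanhf stands in for F
            }
        memcpy(X, tmp, N * N * sizeof(float));
    }
    free(tmp);
}
[/codebox]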
My plan is to solve this by putting as many matrices as possible into shared memory and multiplying them there (a rough sketch of the per-block multiply I have in mind is below).
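Something like this is what I’m ultimately aiming for per block (only a sketch, not code I have running; N = 16 and the one-matrix-pair-per-block launch are just illustrative):
[codebox]#define N 16

// Each block loads one NxN pair into shared memory and multiplies it there.
__global__ void smallMatMul(const float* A, const float* B, float* C)
{
    __shared__ float sA[N][N];
    __shared__ float sB[N][N];

    int row = threadIdx.y;
    int col = threadIdx.x;
    int base = blockIdx.x * N * N;   // one matrix pair per block

    sA[row][col] = A[base + row * N + col];
    sB[row][col] = B[base + row * N + col];
    __syncthreads();

    float acc = 0.0f;
    for (int k = 0; k < N; k++)
        acc += sA[row][k] * sB[k][col];

    C[base + row * N + col] = acc;
}

// launch: smallMatMul<<<numMatrices, dim3(N, N)>>>(dA, dB, dC);
[/codebox]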
So far I have come up with the following source:
[codebox]#include <stdio.h>
#include <cutil_inline.h>

#define StateRows 16
#define StateCols 16
#define LayersCount 4
#define SpiciesCount 256
#define SharedMemSize (16384)
#define SharedMemFloats (SharedMemSize / sizeof(float))
#define ThreadsPerBlock 512
#define SpiciesPerBlock (SharedMemSize / (2 * StateRows * StateCols * sizeof(float)))

#define Cast(Object, Type) ((Type)(Object))

typedef char Byte;

typedef struct
{
    float s[SpiciesCount][StateRows][StateCols];
    float l[LayersCount][SpiciesCount][StateRows][StateCols];
    float r[LayersCount][SpiciesCount][StateRows][StateCols];
    float b[LayersCount][SpiciesCount][StateRows][StateCols];
} Spicies;

typedef struct
{
    float a[SpiciesPerBlock][StateRows][StateCols];
    float b[SpiciesPerBlock][StateRows][StateCols];
} Multiplication;
__global__ void VecAdd(void* globals)
{
    __shared__ Byte SharedMem[SharedMemSize];

    unsigned int localIndex = threadIdx.x;
    unsigned int globalIndex = blockIdx.x * (sizeof(float[SpiciesPerBlock][StateRows][StateCols]) / sizeof(float));

    // each thread strides through the block's chunk and copies it into the shared buffers
    while (localIndex < (sizeof(float[SpiciesPerBlock][StateRows][StateCols]) / sizeof(float)))
    {
        Cast(Cast(SharedMem, Multiplication*)->a, float *)[localIndex] = Cast(Cast(globals, Spicies*)->s[globalIndex], float *)[localIndex];
        Cast(Cast(SharedMem, Multiplication*)->b, float *)[localIndex] = Cast(Cast(globals, Spicies*)->l[globalIndex], float *)[localIndex];
        localIndex += blockDim.x;
    }
    __syncthreads();

    // for debug purposes: global thread 0 dumps the shared buffer back to global memory
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i == 0)
    {
        for (i = 0; i < SharedMemFloats; i++)
        {
            Cast(globals, float *)[i] = Cast(SharedMem, float *)[i];
        }
    }
}
int main(int argc, char** argv)
{
    float* h_data;
    float* d_data;
    int n = 32768;
    size_t size = n * sizeof(float);

    h_data = (float*)malloc(size);
    // for (int i = 0; i < n; i++) h_data[i] = i;

    cutilSafeCall(cudaMalloc((void**)&d_data, size));
    cutilSafeCall(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

    int threadsPerBlock = 512;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_data);

    cutilSafeCall(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));
    for (int i = 0; i < n; i++)
    {
        printf("%f ", h_data[i]);
    }

    if (h_data) free(h_data);
    if (d_data) cudaFree(d_data);
    cutilSafeCall(cudaThreadExit());
}
[/codebox]
This fails with "ptxas error : Entry function '_Z6VecAddPv' uses too much shared data (0x4004 bytes + 0x10 bytes system, 0x4000 max)". It seems like some reserved data is being added on top of my shared memory allocation. How can I get around that?
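For reference, here is the quick check I used to confirm the per-block limit on my card (just a diagnostic sketch, separate from the code above):
[codebox]#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("sharedMemPerBlock = %lu bytes\n", (unsigned long)prop.sharedMemPerBlock);
    // My kernel declares 16384 bytes itself; I'm guessing the extra 4 bytes in the
    // error (0x4004) come from the kernel's pointer parameter and the 0x10 bytes
    // from driver bookkeeping, but I'm not sure.
    return 0;
}
[/codebox]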