Hello,
Consider a problem where variables are repeatedly read. Time will be saved when the variables are copied to shared memory for reading rather than repeatedly calling the global memory.
Question:
Is it possible to benefit from shared memory when the problem size is larger than the variable size being copied - if so how can one copy the smaller variable size into each block so that it can be read across many blocks?
ie.
int n = 64;
int blockdim = 32; // block_dim
int g_size = 12;
test<<<n/blockdim,blockdim,g_size*sizeof(float)>>>(global_var,g_size,n);
global void test(float*global_var,int g_size,int n)
{
int tx = threadIdx.x;
int i = threadIdx.x + blockIdx.x * blockDim.x; // size of n
extern shared float sdata;
// fill shared up to var size limit rather than problem size limit
if(i < g_size)
{
sdata[tx] = global_var[i];
printf("share fill %i %f \n",blockIdx.x,sdata[0]);
}
__syncthreads();
// will return empty sdata when 0 < blockIdx.x
printf(“share check %i %f \n”,blockIdx.x,sdata[0]);
}