# Shared Memory and Matrix

Hi, I need your help.

I need calculate min value in a matrix that rapresent an image so I thought use to shred memory : Each block on the device evaluate the min value so I calculate the min of min on host.

I written this kernel function :

``````__global__ void minKernel(float *matrix_linearized, size_t eelements, size_t width, size_t height, float* minReturn)
{
int const a = 64; //thread per blocco
__shared__ float cache[a];

float min_value = FLT_MAX; //inizializza il minimo

int i = threadIdx.x + blockDim.x * blockIdx.x;
int j = threadIdx.y + blockDim.y * blockIdx.y;

int indexCache = j + i * N; // linearizzazione matrice
// int indexCache = j + i;
//int indexCache = threadIdx.x;

while (i < width && j < height) {

if (width && height)
min_value = *(matrix_linearized+ eelements* 0 + 0);
float tmp = *(matrix_linearized+ eelements* j + i);

if (tmp <= min_value)
{
min_value = tmp;
}
i += blockDim.x * gridDim.x;
j += blockDim.y * gridDim.y;
}

cache[indexCache] = min_value;

int k = (blockDim.x) / 2;
int z = (blockDim.y) / 2;

while (k != 0 && z != 0)
{

int offset = z + k * N;
//int offset = z + k;
if (indexCache < offset) //z+k*N : linearizzazione matrice
{
if (cache[indexCache] > cache[indexCache + offset])
{
cache[indexCache] = cache[indexCache + offset];
}
}
k /= 2;
z /= 2;
}

if (indexCache == 0)
{
minReturn[blockIdx.x] = cache[indexCache];
}
}
``````

The Main Code :

``````void  mini(float *buffer, size_t elementsPerLine, size_t width, size_t height, int size_buffer) //return min value
{

float* buffer_d = NULL;
size_t size_buffer_d = size_buffer*sizeof(float);
cudaError_t error = cudaMalloc((void**)&buffer_d, size_buffer_d);
if (error != cudaSuccess)
{
fprintf(stderr, "Unable to allocate buffer_d memory: %s\n", cudaGetErrorString(error));
}
cudaMemcpy(buffer_d, buffer, size_buffer_d, cudaMemcpyHostToDevice);

dim3 blocksPerGrid (width / threadBlocks.x, height / threadBlocks.y);

float* minOutput_d = NULL;
error = cudaMalloc((void**)&minOutput_d, blocksPerGrid.x*sizeof(float));
if (error != cudaSuccess)
{
fprintf(stderr, "Unable to allocate buffer_d memory: %s\n", cudaGetErrorString(error));
}

minKernel << <blocksPerGrid, threadBlocks >> >(buffer_d, elementsPerLine, width, height, minOutput_d);

float* minOutput_h = (float*)malloc (blocksPerGrid.x*sizeof(float));

cudaMemcpy(minOutput_h, minOutput_d, blocksPerGrid.x*sizeof(float), cudaMemcpyDeviceToHost);

for (int  i = 0; i < blocknum.x; i++)
{
printf("A => %f", minOutput_h[i]);
}

//printf("\nresult : %3.2f \n", minOutput_h[0]);

cudaFree(buffer_d);
free(minOutput_h);

}
``````

But my code doesn’t work and I don’t know the reason.

One problem is certainly the IndexCache but I don’t know how evaluate.

Could you help me, please?

Hi,
I’ve solved my problem in this way :D , what do you think?:

FIND MAX ELEMENT IN MATRIX LINEARIZED

``````const int N = 16;
const int threadPerBlock =4 ;

__global__ void maxKernel(float *buffer, float* maxReturn)//buffer is matrix linearized
{

int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tyd = threadIdx.y + blockIdx.y * blockDim.y;

float max_value = FLT_MIN;

int testIndex = tyd + tid * blockDim.y; // 0 - 255 ;

while (testIndex < N)
{
if (buffer[testIndex]>max_value)
{
max_value = buffer[testIndex];
}
tid += blockDim.x * gridDim.x;
tyd += blockDim.y * gridDim.y;
testIndex += tyd + tid * blockDim.y;
}

// set the cache values
cache[cacheIndex] = max_value;
// synchronize threads in this block
// for reductions, threadsPerBlock must be a power of 2
// because of the following code

int i = blockDim.x*blockDim.y/2; //square matrix
while (i != 0 ) {
if (cacheIndex < i )
{
if (cache[cacheIndex] < cache[cacheIndex + i])
{
cache[cacheIndex] = cache[cacheIndex + i];
}
}