Shared Memory and Matrix

Hi, I need your help.

I need to calculate the minimum value of a matrix that represents an image, so I thought of using shared memory: each block on the device computes its local minimum, and then I calculate the min of the mins on the host.
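The host-side "min of mins" step is just a linear scan over the per-block results; here is a minimal sketch of what I mean (minOfMins, partialMins and numBlocks are placeholder names, not variables from my real code):

#include <cfloat> // FLT_MAX

// Host-side reduction: take the minimum of the per-block partial minima.
float minOfMins(const float *partialMins, int numBlocks)
{
    float result = FLT_MAX;               // identity element for a min reduction
    for (int b = 0; b < numBlocks; ++b)   // one partial minimum per block
    {
        if (partialMins[b] < result)
            result = partialMins[b];
    }
    return result;
}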

For the device part, I wrote this kernel function:

__global__ void minKernel(float *matrix_linearized, size_t eelements, size_t width, size_t height, float *minReturn)
{
    int const a = 64; // threads per block
    __shared__ float cache[a];

    float min_value = FLT_MAX; // initialize the minimum

    int i = threadIdx.x + blockDim.x * blockIdx.x;
    int j = threadIdx.y + blockDim.y * blockIdx.y;

    int indexCache = j + i * N; // matrix linearization
    // int indexCache = j + i;
    // int indexCache = threadIdx.x;

    // grid-stride loop: each thread keeps a private minimum
    while (i < width && j < height) {

        if (width && height)
            min_value = *(matrix_linearized + eelements * 0 + 0);
        float tmp = *(matrix_linearized + eelements * j + i);

        if (tmp <= min_value)
        {
            min_value = tmp;
        }
        i += blockDim.x * gridDim.x;
        j += blockDim.y * gridDim.y;
    }

    cache[indexCache] = min_value;

    __syncthreads();

    // tree reduction in shared memory
    int k = (blockDim.x) / 2;
    int z = (blockDim.y) / 2;

    while (k != 0 && z != 0)
    {
        int offset = z + k * N; // z + k*N : matrix linearization
        // int offset = z + k;
        if (indexCache < offset)
        {
            if (cache[indexCache] > cache[indexCache + offset])
            {
                cache[indexCache] = cache[indexCache + offset];
            }
        }
        __syncthreads();
        k /= 2;
        z /= 2;
    }

    // thread 0 of the block writes out the block's minimum
    if (indexCache == 0)
    {
        minReturn[blockIdx.x] = cache[indexCache];
    }
}

The main code:

void mini(float *buffer, size_t elementsPerLine, size_t width, size_t height, int size_buffer) // returns the min value
{
    float *buffer_d = NULL;
    size_t size_buffer_d = size_buffer * sizeof(float);
    cudaError_t error = cudaMalloc((void**)&buffer_d, size_buffer_d);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Unable to allocate buffer_d memory: %s\n", cudaGetErrorString(error));
    }
    cudaMemcpy(buffer_d, buffer, size_buffer_d, cudaMemcpyHostToDevice);

    dim3 threadBlocks(8, 8); // 64 threads per block
    dim3 blocksPerGrid(width / threadBlocks.x, height / threadBlocks.y);

    float *minOutput_d = NULL;
    error = cudaMalloc((void**)&minOutput_d, blocksPerGrid.x * sizeof(float));
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Unable to allocate minOutput_d memory: %s\n", cudaGetErrorString(error));
    }

    minKernel<<<blocksPerGrid, threadBlocks>>>(buffer_d, elementsPerLine, width, height, minOutput_d);

    float *minOutput_h = (float*)malloc(blocksPerGrid.x * sizeof(float));

    cudaMemcpy(minOutput_h, minOutput_d, blocksPerGrid.x * sizeof(float), cudaMemcpyDeviceToHost);

    for (int i = 0; i < blocksPerGrid.x; i++)
    {
        printf("A => %f", minOutput_h[i]);
    }

    //printf("\nresult : %3.2f \n", minOutput_h[0]);

    cudaFree(buffer_d);
    cudaFree(minOutput_d);
    free(minOutput_h);
}
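One thing I still have to add is error checking around the kernel launch itself; this is only a sketch of the usual pattern (checkKernel is a helper name I made up for the example):

#include <cstdio>

// Report errors from the most recent kernel launch: cudaGetLastError catches
// launch-configuration problems, cudaDeviceSynchronize catches errors raised
// while the kernel actually runs.
static void checkKernel(const char *name)
{
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "%s launch failed: %s\n", name, cudaGetErrorString(err));

    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        fprintf(stderr, "%s execution failed: %s\n", name, cudaGetErrorString(err));
}

// usage, right after the launch in mini():
//   minKernel<<<blocksPerGrid, threadBlocks>>>(buffer_d, elementsPerLine, width, height, minOutput_d);
//   checkKernel("minKernel");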

But my code doesn't work, and I don't know why.

One problem is certainly indexCache, but I don't know how to compute it; below is a sketch of what I think it should be.
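This is only my guess of what a correct version could look like (minKernelSketch is a made-up name, it is not tested, and it assumes an 8x8 block, a row-major matrix with width as the row pitch, and one output slot per block):

#include <float.h> // FLT_MAX

__global__ void minKernelSketch(const float *matrix, size_t width, size_t height, float *minReturn)
{
    __shared__ float cache[64]; // 8x8 threads per block

    // Flatten the 2D thread position inside the block to 0..63,
    // so every thread owns exactly one slot of the shared cache.
    int indexCache = threadIdx.y * blockDim.x + threadIdx.x;

    int col = threadIdx.x + blockDim.x * blockIdx.x;
    int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Grid-stride loops: each thread keeps a private minimum of its pixels.
    float min_value = FLT_MAX;
    for (int r = row; r < height; r += blockDim.y * gridDim.y)
        for (int c = col; c < width; c += blockDim.x * gridDim.x)
            min_value = fminf(min_value, matrix[r * width + c]);

    cache[indexCache] = min_value;
    __syncthreads();

    // Tree reduction over the 64 slots; blockDim.x * blockDim.y must be a power of two.
    for (int offset = (blockDim.x * blockDim.y) / 2; offset > 0; offset /= 2)
    {
        if (indexCache < offset)
            cache[indexCache] = fminf(cache[indexCache], cache[indexCache + offset]);
        __syncthreads();
    }

    // One result per block; minReturn would need gridDim.x * gridDim.y entries.
    if (indexCache == 0)
        minReturn[blockIdx.y * gridDim.x + blockIdx.x] = cache[0];
}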

Could you help me, please?

Hi,
I've solved my problem in this way :D. What do you think?

FIND THE MAX ELEMENT IN A LINEARIZED MATRIX

const int N = 16;
const int threadPerBlock = 4;

__global__ void maxKernel(float *buffer, float *maxReturn) // buffer is the linearized matrix
{
    __shared__ float cache[threadPerBlock * threadPerBlock];

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int tyd = threadIdx.y + blockIdx.y * blockDim.y;

    int cacheIndex = threadIdx.y + threadIdx.x * blockDim.y;

    float max_value = -FLT_MAX; // FLT_MIN is the smallest positive float, not the most negative

    int testIndex = tyd + tid * blockDim.y * gridDim.y; // global row-major index: 0 - 255
  
    while (testIndex < N * N) // scan the whole linearized matrix (N*N elements)
    {
        if (buffer[testIndex] > max_value)
        {
            max_value = buffer[testIndex];
        }
        tid += blockDim.x * gridDim.x;
        tyd += blockDim.y * gridDim.y;
        testIndex = tyd + tid * blockDim.y * gridDim.y; // recompute the index after the grid stride
    }

    // set the cache values
    cache[cacheIndex] = max_value;
    // synchronize threads in this block
    __syncthreads();
    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code

    int i = blockDim.x * blockDim.y / 2; // half the threads in the (square) block
    while (i != 0) {
        if (cacheIndex < i )
        {
            if (cache[cacheIndex] < cache[cacheIndex + i])
            {
                cache[cacheIndex] = cache[cacheIndex + i];
            }
        }

        __syncthreads();
        i /= 2;
    }

    if (cacheIndex == 0)
    {
        // one result per block; with a 2D grid each block needs its own slot
        maxReturn[blockIdx.y * gridDim.x + blockIdx.x] = cache[0];
    }
}
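For a 16x16 test matrix I call it roughly like this; the driver below is only a sketch of my test (the variable names are made up for the example, the host array is filled with random values, and the per-block maxima are reduced on the CPU at the end):

#include <cfloat>
#include <cstdio>
#include <cstdlib>

int main()
{
    const int elements = N * N;                         // 16x16 linearized matrix
    float host[N * N];
    for (int k = 0; k < elements; ++k)
        host[k] = (float)(rand() % 1000);

    dim3 block(threadPerBlock, threadPerBlock);         // 4x4 threads per block
    dim3 grid(N / threadPerBlock, N / threadPerBlock);  // 4x4 blocks
    int numBlocks = grid.x * grid.y;

    float *buffer_d = NULL, *maxReturn_d = NULL;
    cudaMalloc((void**)&buffer_d, elements * sizeof(float));
    cudaMalloc((void**)&maxReturn_d, numBlocks * sizeof(float));
    cudaMemcpy(buffer_d, host, elements * sizeof(float), cudaMemcpyHostToDevice);

    maxKernel<<<grid, block>>>(buffer_d, maxReturn_d);

    float *partial = (float*)malloc(numBlocks * sizeof(float)); // one maximum per block
    cudaMemcpy(partial, maxReturn_d, numBlocks * sizeof(float), cudaMemcpyDeviceToHost);

    float maxValue = -FLT_MAX;                          // max of the per-block maxima
    for (int b = 0; b < numBlocks; ++b)
        if (partial[b] > maxValue)
            maxValue = partial[b];

    printf("max = %f\n", maxValue);

    free(partial);
    cudaFree(buffer_d);
    cudaFree(maxReturn_d);
    return 0;
}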