Hi, I need your help.
I need calculate min value in a matrix that rapresent an image so I thought use to shred memory : Each block on the device evaluate the min value so I calculate the min of min on host.
I written this kernel function :
__global__ void minKernel(float *matrix_linearized, size_t eelements, size_t width, size_t height, float* minReturn)
{
int const a = 64; //thread per blocco
__shared__ float cache[a];
float min_value = FLT_MAX; //inizializza il minimo
int i = threadIdx.x + blockDim.x * blockIdx.x;
int j = threadIdx.y + blockDim.y * blockIdx.y;
int indexCache = j + i * N; // linearizzazione matrice
// int indexCache = j + i;
//int indexCache = threadIdx.x;
while (i < width && j < height) {
if (width && height)
min_value = *(matrix_linearized+ eelements* 0 + 0);
float tmp = *(matrix_linearized+ eelements* j + i);
if (tmp <= min_value)
{
min_value = tmp;
}
i += blockDim.x * gridDim.x;
j += blockDim.y * gridDim.y;
}
cache[indexCache] = min_value;
__syncthreads();
int k = (blockDim.x) / 2;
int z = (blockDim.y) / 2;
while (k != 0 && z != 0)
{
int offset = z + k * N;
//int offset = z + k;
if (indexCache < offset) //z+k*N : linearizzazione matrice
{
if (cache[indexCache] > cache[indexCache + offset])
{
cache[indexCache] = cache[indexCache + offset];
}
}
__syncthreads();
k /= 2;
z /= 2;
}
if (indexCache == 0)
{
minReturn[blockIdx.x] = cache[indexCache];
}
}
The Main Code :
void mini(float *buffer, size_t elementsPerLine, size_t width, size_t height, int size_buffer) //return min value
{
float* buffer_d = NULL;
size_t size_buffer_d = size_buffer*sizeof(float);
cudaError_t error = cudaMalloc((void**)&buffer_d, size_buffer_d);
if (error != cudaSuccess)
{
fprintf(stderr, "Unable to allocate buffer_d memory: %s\n", cudaGetErrorString(error));
}
cudaMemcpy(buffer_d, buffer, size_buffer_d, cudaMemcpyHostToDevice);
dim3 threadBlocks(8,8); //64 Threade per blocco
dim3 blocksPerGrid (width / threadBlocks.x, height / threadBlocks.y);
float* minOutput_d = NULL;
error = cudaMalloc((void**)&minOutput_d, blocksPerGrid.x*sizeof(float));
if (error != cudaSuccess)
{
fprintf(stderr, "Unable to allocate buffer_d memory: %s\n", cudaGetErrorString(error));
}
minKernel << <blocksPerGrid, threadBlocks >> >(buffer_d, elementsPerLine, width, height, minOutput_d);
float* minOutput_h = (float*)malloc (blocksPerGrid.x*sizeof(float));
cudaMemcpy(minOutput_h, minOutput_d, blocksPerGrid.x*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < blocknum.x; i++)
{
printf("A => %f", minOutput_h[i]);
}
//printf("\nresult : %3.2f \n", minOutput_h[0]);
cudaFree(buffer_d);
free(minOutput_h);
}
But my code doesn’t work and I don’t know the reason.
One problem is certainly the IndexCache but I don’t know how evaluate.
Could you help me, please?