I want to add the all pixels in an 128x128 image. For me the code I show below is fine, however does not show the correct result. I have the sum by the reduction sum method, which is applied twice. In a first part, all values (pixels) in each block are added and the resulting vector is stored in global memory. In the second part, I add the all values of last vector mentioned.
Can anyone tell me why it does not work properly?
//Este programa realiza el producto punto (dot product) entre dos vectores
const int filas = 128;
const int columnas = 128;
const int N = filas*columnas;
const int threadsPerBlock = 16;
const int blocksPerGrid = (filas+threadsPerBlock-1) / threadsPerBlock;
global void dot( float *a, float c, int width, int heigth)
{
shared float smem[threadsPerBlockthreadsPerBlock];
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int idy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int index = idy*width + idx;
unsigned int bindex = threadIdx.y * blockDim.y + threadIdx.x;
//each thread copies its pixel of the block to shared memory
smem[bindex] = a[index];
__syncthreads();
//First part of the sum
int i = (blockDim.x*blockDim.y)>>1;
while (i != 0)
{
if (bindex < i)
smem[bindex] += smem[bindex + i];
__syncthreads();
i=i >>1 ;
}
int tid = blockIdx.x + blockIdx.y * blocksPerGrid;
if (bindex == 0)
c[tid] = smem[0];
__syncthreads();
//Second part of the sum
int j = (blocksPerGrid*blocksPerGrid)>>1;
while (j != 0)
{
if (idx < j)
c[idx] += c[idx + j];
__syncthreads();
j = j >>1 ;
}
}
int main(void)
{
float *a, partial_c;
floatdev_a, *dev_partial_c;
// allocate memory on the CPU side
a = (float*)malloc( N*sizeof(float) );
partial_c = (float*)malloc( blocksPerGrid*blocksPerGrid*sizeof(float) );
//cudaDeviceReset();
// allocate the memory on the GPU
HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N*sizeof(float) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,blocksPerGrid*blocksPerGrid*sizeof(float) ) );
int k=0;
/////////////////////////////////////////////////////////////////////////////////////
//////////// Read Image from file ///////////////////////////
/////////////////////////////////////////////////////////////////////////////////////
ifstream fe(“C:/Proyecto_CUDA/DatosIn.txt”);
while(!fe.eof())
fe >> a[k++];
fe.close();
/////////////////////////////////////////////////////////////////////////////////////
// copy the arrays ‘a’ and ‘b’ to the GPU
HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),cudaMemcpyHostToDevice ) );
dim3 grid(blocksPerGrid, blocksPerGrid, 1);
dim3 threads(threadsPerBlock, threadsPerBlock,1);
dot<<<grid, threads>>>( dev_a,dev_partial_c, columnas, filas);
// copy the array 'c' back from the GPU to the CPU
HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c, blocksPerGrid * blocksPerGrid * sizeof(float),cudaMemcpyDeviceToHost ) );
//Show the sum
cout<<"The sum of image is: "<<partial_c[0]<<endl;
// free memory on the GPU side
cudaFree( dev_a );
cudaFree( dev_partial_c );
// free memory on the CPU side
free( a );
free( partial_c );
getch();
return 0;
}