Histogram without Atomics

Hi all,

I’m trying to write a code for computing the histogram (without Atomics)

GPU will compute partial histograms, and then , CPU will add the partial histograms to get the main result.

here is the code:

// Includes

#include <stdio.h>

#include <cutil_inline.h>

////////////// global variables initializing ///////////

int N=10;	//////Number of input elements

int H=6;	///Histogram values

int T=4;	///Threads per block (Blocksize)

int B=4;	///blocks per grid

// Input Array Variables

int* h_In = NULL;

int* d_In = NULL;

int* CPU_result = NULL;

int* memset_h = NULL;

// Output Array

int* h_Out = NULL;

int* d_Out = NULL;

int* d_memset=NULL;

// Functions

void Cleanup(void);

void RandomInit(int*, int, int);

void CPUHistogram(int*, int , int, int*);	

void ParseArguments(int, char**);

/////////////////////////////////// Device code//////////////////////////////////

__global__ void GPU_histogram(int* g_idata, int* g_odata, int n, int h, int* histo_d )

{

  // shared memory size declared at kernel launch

  extern __shared__ int sdata[]; 

unsigned int tid = threadIdx.x; 

  unsigned int globalid = blockIdx.x*blockDim.x + threadIdx.x; 

// For thread ids greater than data space

 if (globalid < n) {

    sdata[tid] = g_idata[globalid]; 

  }

  else {

     sdata[tid] = 0;  // Case of extra threads above N

  }

__syncthreads();

g_odata[(blockIdx.x*h)+sdata[tid]]++;

__syncthreads();

}	

/////////////////////////// Host code/////////////////////////////////////////////

int main(int argc, char** argv)

{

    ParseArguments(argc, argv);

printf("\nNo. of Numbers: %d",N);

printf("\tHistogram Value %d",H);

printf("\tThreads per Block: %d",T);

printf("\tBlocks per Grid: %d\n",B);

size_t in_size = N * sizeof(int);

// Set the kernel arguments

int threadsPerBlock = T;

int sharedMemSize = threadsPerBlock * sizeof(int);

int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

printf("\n\tNO of blocks:  %d\n",blocksPerGrid);

int P=H*blocksPerGrid ;

size_t out_size = P * sizeof(int);

// Allocate input vectors h_In in host memory

h_In = (int*)malloc(in_size);

if (h_In == 0) 

Cleanup();

//for histo in GPU 

memset_h = (int*)malloc(out_size);

if (memset_h == 0) 

Cleanup();

size_t cpu_hist_size = H * sizeof(int);//CPU_result has H values(No. of Histogram elements)....for CPU function

	// Allocate input vector CPU_result in host memory

CPU_result = (int*)malloc(cpu_hist_size);

if (CPU_result == 0) 

Cleanup();

// Initialize input vectors

RandomInit(h_In, N, H);	

//Print random values

printf("\n");

for(int i=0;i<N;i++)

printf("%d\t", h_In[i]);

// Allocate host output

h_Out = (int*)malloc(out_size);

if (h_Out == 0) 

Cleanup();

	//CPU histogram function

CPUHistogram(h_In, N, H, CPU_result);	

printf("\n");

for(int i=0;i<H;i++)

printf("%d\t", CPU_result[i]);

printf("\n");

	//make sure No. of input element is the same as histogram elements count

long histoCount=0;

for (int i=0;i<H;i++)

  histoCount +=CPU_result[i];

  printf("\nsum of Histogram elements: %ld\n", histoCount);

  if (histoCount!=N)

  printf("\nFail");

// Allocate vectors in device memory

    cutilSafeCall( cudaMalloc((void**)&d_In, in_size) );

    cutilSafeCall( cudaMalloc((void**)&d_Out, out_size) );

cudaMemset(d_Out,0,out_size);////////////set output of the GPU to zero

cutilSafeCall( cudaMalloc((void**)&d_memset, out_size) );

// Copy h_In from host memory to device memory

cudaMemcpy(d_In,h_In,in_size,cudaMemcpyHostToDevice);

cudaMemcpy(d_memset,memset_h,out_size,cudaMemcpyHostToDevice);

// Invoke kernel

    GPU_histogram<<<blocksPerGrid, threadsPerBlock, sharedMemSize>>>(d_In, d_Out, N, H,  d_memset);

    cutilCheckMsg("kernel launch failure");

    cutilSafeCall( cudaThreadSynchronize() ); // Have host wait for kernel

// copy results back from GPU to the h_Out

cudaMemcpy(h_Out,d_Out,out_size,cudaMemcpyDeviceToHost);

// showing partial histograms

for (int i=0;i<P;i++)

printf("%d\t",h_Out[i]);

printf("\n");

Cleanup();

}

/////////CleanUp function/////////

void Cleanup(void)

{

    // Free device memory

    if (d_In)

        cudaFree(d_In);

    if (d_Out)

        cudaFree(d_Out);

// Free host memory

    if (h_In)

        free(h_In);

    if (h_Out)

        free(h_Out);

if (CPU_result)

        free(CPU_result);

if (memset_h)

        free(memset_h);

cutilSafeCall( cudaThreadExit() );

exit(0);

}

///////////// Allocates an array with random int entries./////////////////////

void RandomInit(int* data, int n, int H)

{

	srand(1);

    for (int i = 0; i < n; i++)

        data[i] = rand() % H;

}

//////////////////////CPU calculation/////////////////////////////////

void CPUHistogram(int* data, int n,int h, int* hist)

{

	//set histogram elements to zero

for (int i=0;i<h;i++)

  hist[i]=0;

	//compute the histogram  

  for (int i=0;i<n;i++)

  hist[data[i]]++;

}

//////////////////// Parse program arguments//////////////////////////////////////

void ParseArguments(int argc, char** argv)

{

    for (int i = 0; i < argc; ++i) {

        if (strcmp(argv[i], "--number") == 0 || strcmp(argv[i], "-number") == 0) {

            N = atoi(argv[i+1]);			

	    i = i + 1;

        }

	if (strcmp(argv[i], "--hist") == 0 || strcmp(argv[i], "-hist") == 0) {

            H = atoi(argv[i+1]);			

	    i = i + 1;	 

        }

        if (strcmp(argv[i], "--thread") == 0 || strcmp(argv[i], "-thread") == 0) {

            T = atoi(argv[i+1]);			

	    i = i + 1;	 

        }

	if (strcmp(argv[i], "--block") == 0 || strcmp(argv[i], "-block") == 0) {

            B = atoi(argv[i+1]);

	    i = i + 1;	 			

        }

    }

}

the problem is in GPU : it only shows the arrays which are present in input array (not compute number of those arrays)

for example: input: 1 4 3 1 5 1 4 0 3 1

output of the GPU: 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 1 0 0

it shows in the first block, we have input[1] and input[3] and input[4]…(doesn’t show partial histogram)

what’s wrong with GPU calculation?