Hi all,
I’m trying to write a code for computing the histogram (without Atomics)
GPU will compute partial histograms, and then , CPU will add the partial histograms to get the main result.
here is the code:
// Includes
#include <stdio.h>
#include <cutil_inline.h>
////////////// global variables initializing ///////////
int N=10; //////Number of input elements
int H=6; ///Histogram values
int T=4; ///Threads per block (Blocksize)
int B=4; ///blocks per grid
// Input Array Variables
int* h_In = NULL;
int* d_In = NULL;
int* CPU_result = NULL;
int* memset_h = NULL;
// Output Array
int* h_Out = NULL;
int* d_Out = NULL;
int* d_memset=NULL;
// Functions
void Cleanup(void);
void RandomInit(int*, int, int);
void CPUHistogram(int*, int , int, int*);
void ParseArguments(int, char**);
/////////////////////////////////// Device code//////////////////////////////////
__global__ void GPU_histogram(int* g_idata, int* g_odata, int n, int h, int* histo_d )
{
// shared memory size declared at kernel launch
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int globalid = blockIdx.x*blockDim.x + threadIdx.x;
// For thread ids greater than data space
if (globalid < n) {
sdata[tid] = g_idata[globalid];
}
else {
sdata[tid] = 0; // Case of extra threads above N
}
__syncthreads();
g_odata[(blockIdx.x*h)+sdata[tid]]++;
__syncthreads();
}
/////////////////////////// Host code/////////////////////////////////////////////
int main(int argc, char** argv)
{
ParseArguments(argc, argv);
printf("\nNo. of Numbers: %d",N);
printf("\tHistogram Value %d",H);
printf("\tThreads per Block: %d",T);
printf("\tBlocks per Grid: %d\n",B);
size_t in_size = N * sizeof(int);
// Set the kernel arguments
int threadsPerBlock = T;
int sharedMemSize = threadsPerBlock * sizeof(int);
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
printf("\n\tNO of blocks: %d\n",blocksPerGrid);
int P=H*blocksPerGrid ;
size_t out_size = P * sizeof(int);
// Allocate input vectors h_In in host memory
h_In = (int*)malloc(in_size);
if (h_In == 0)
Cleanup();
//for histo in GPU
memset_h = (int*)malloc(out_size);
if (memset_h == 0)
Cleanup();
size_t cpu_hist_size = H * sizeof(int);//CPU_result has H values(No. of Histogram elements)....for CPU function
// Allocate input vector CPU_result in host memory
CPU_result = (int*)malloc(cpu_hist_size);
if (CPU_result == 0)
Cleanup();
// Initialize input vectors
RandomInit(h_In, N, H);
//Print random values
printf("\n");
for(int i=0;i<N;i++)
printf("%d\t", h_In[i]);
// Allocate host output
h_Out = (int*)malloc(out_size);
if (h_Out == 0)
Cleanup();
//CPU histogram function
CPUHistogram(h_In, N, H, CPU_result);
printf("\n");
for(int i=0;i<H;i++)
printf("%d\t", CPU_result[i]);
printf("\n");
//make sure No. of input element is the same as histogram elements count
long histoCount=0;
for (int i=0;i<H;i++)
histoCount +=CPU_result[i];
printf("\nsum of Histogram elements: %ld\n", histoCount);
if (histoCount!=N)
printf("\nFail");
// Allocate vectors in device memory
cutilSafeCall( cudaMalloc((void**)&d_In, in_size) );
cutilSafeCall( cudaMalloc((void**)&d_Out, out_size) );
cudaMemset(d_Out,0,out_size);////////////set output of the GPU to zero
cutilSafeCall( cudaMalloc((void**)&d_memset, out_size) );
// Copy h_In from host memory to device memory
cudaMemcpy(d_In,h_In,in_size,cudaMemcpyHostToDevice);
cudaMemcpy(d_memset,memset_h,out_size,cudaMemcpyHostToDevice);
// Invoke kernel
GPU_histogram<<<blocksPerGrid, threadsPerBlock, sharedMemSize>>>(d_In, d_Out, N, H, d_memset);
cutilCheckMsg("kernel launch failure");
cutilSafeCall( cudaThreadSynchronize() ); // Have host wait for kernel
// copy results back from GPU to the h_Out
cudaMemcpy(h_Out,d_Out,out_size,cudaMemcpyDeviceToHost);
// showing partial histograms
for (int i=0;i<P;i++)
printf("%d\t",h_Out[i]);
printf("\n");
Cleanup();
}
/////////CleanUp function/////////
void Cleanup(void)
{
// Free device memory
if (d_In)
cudaFree(d_In);
if (d_Out)
cudaFree(d_Out);
// Free host memory
if (h_In)
free(h_In);
if (h_Out)
free(h_Out);
if (CPU_result)
free(CPU_result);
if (memset_h)
free(memset_h);
cutilSafeCall( cudaThreadExit() );
exit(0);
}
///////////// Allocates an array with random int entries./////////////////////
void RandomInit(int* data, int n, int H)
{
srand(1);
for (int i = 0; i < n; i++)
data[i] = rand() % H;
}
//////////////////////CPU calculation/////////////////////////////////
void CPUHistogram(int* data, int n,int h, int* hist)
{
//set histogram elements to zero
for (int i=0;i<h;i++)
hist[i]=0;
//compute the histogram
for (int i=0;i<n;i++)
hist[data[i]]++;
}
//////////////////// Parse program arguments//////////////////////////////////////
void ParseArguments(int argc, char** argv)
{
for (int i = 0; i < argc; ++i) {
if (strcmp(argv[i], "--number") == 0 || strcmp(argv[i], "-number") == 0) {
N = atoi(argv[i+1]);
i = i + 1;
}
if (strcmp(argv[i], "--hist") == 0 || strcmp(argv[i], "-hist") == 0) {
H = atoi(argv[i+1]);
i = i + 1;
}
if (strcmp(argv[i], "--thread") == 0 || strcmp(argv[i], "-thread") == 0) {
T = atoi(argv[i+1]);
i = i + 1;
}
if (strcmp(argv[i], "--block") == 0 || strcmp(argv[i], "-block") == 0) {
B = atoi(argv[i+1]);
i = i + 1;
}
}
}
the problem is in GPU : it only shows the arrays which are present in input array (not compute number of those arrays)
for example: input: 1 4 3 1 5 1 4 0 3 1
output of the GPU: 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 1 0 0
it shows in the first block, we have input[1] and input[3] and input[4]…(doesn’t show partial histogram)
what’s wrong with GPU calculation?