Hi,
In the scope of image processing I would need a fast pix values histogram from my images,
I’m desperately trying to use CUB Histogram rather than atomicHistogram as it is probalby much faster.
I derived a test piece of code by copying the provided sample in the CUB library but couldn’t get sensible output. neither found on the web any other example to mimic.
Here is my code
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
#define TPB 16
#define RANGE 16
// Create and compute histogram
int main (int argc, char * argv[])
{
// Declare, allocate, and initialize device-accessible pointers for input samples
// and output histogram
int num_samples = 10;
int *d_samples;
int *d_histogram;
int num_levels = 10; // (10 level boundaries for 9 bins)
int lower_level = 0; // lower sample value boundary of lowest bin)
int upper_level = 9; // (upper sample value boundary of upper bin)
//Initialize vector with random values
int h_samples[num_samples] = {1,2,3,4,5,6,7,8,9,10};
printf("Samples\n");
for (int i = 0; i < num_samples; i++)
printf ("%d ", h_samples[i]);
cudaMalloc (&d_samples, num_samples * sizeof(int));
cudaMemcpy (&d_samples, h_samples, num_samples * sizeof(int), cudaMemcpyHostToDevice);
//Allocate device and host memory for histogram
cudaMalloc (&d_histogram, (num_levels+1) * sizeof(int));
//int *h_histogram = (int *) malloc ((num_levels+1) * sizeof(int));
int h_histogram[num_levels];
// Determine temporary device storage requirements
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
// Allocate temporary storage
cudaMalloc(&d_temp_storage, temp_storage_bytes);
// Compute histograms
cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
cudaMemcpy (&h_histogram, d_histogram, (num_levels+1) * sizeof(int), cudaMemcpyDeviceToHost);
printf("\nHistogram\n");
for (int i = 0; i <= num_levels; i++)
printf ("%d ", h_histogram[i]);
// Cleanup and closing
cudaFree(d_samples); cudaFree(d_histogram); cudaFree(d_temp_storage);
printf("\n");
return 0;
}
Here is the ouput with wrong histogram values
Samples
1 2 3 4 5 6 7 8 9 10
Histogram
10 0 0 0 0 0 0 0 0 0 0
Obviously it is not what is expected but cannot get what is wrong…
Please help, many thanks in advance