This is my minimal sample code; thanks very much!
My environment is:
NVIDIA Nsight Visual Studio Edition Version 5.2
{Build Number 5.2.0.16321}
CUDA8.0 >{CUDA Toolkit 7.5 or 8.0}
GTX1080
Visual Studio 2013
Driver{376.51}>{NVIDIA Display Driver version 376.09}
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <iostream>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include "device_launch_parameters.h"
using namespace std;
// thrust
#include <thrust/host_vector.h> // host vector
#include <thrust/device_vector.h> // device vector
#include <thrust/device_ptr.h> // device ptr
#include <thrust/iterator/counting_iterator.h> // iterator
#include <thrust/transform.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/generate.h>
#include <thrust/binary_search.h>
#include <thrust/random.h>
#include <thrust/functional.h>
// Functor that maps a 2D point to the linear (row-major) index of a bucket
// in a grid that is w buckets wide and h buckets tall.
// NOTE(review): presumably p.x and p.y are normalized to [0, 1]; values
// outside that range are not clamped here, so the returned index can fall
// outside [0, w * h) -- confirm against the caller's data.
struct point_to_bucket_index : public thrust::unary_function<float2, unsigned int>
{
    // Builds a functor for a grid of `width` x `height` buckets.
    // The qualifiers must be spelled __host__ __device__ so that the functor
    // can be constructed on the host and copied into device code by Thrust.
    __host__ __device__ point_to_bucket_index(unsigned int width, unsigned int height)
        : w(width), h(height) {}

    // Returns y * w + x, where x and y are p.x * (w - 1) and p.y * (h - 1)
    // truncated to unsigned int.
    __host__ __device__ unsigned int operator()(float2 p) const
    {
        // find the raster indices of p's bucket
        unsigned int x = static_cast<unsigned int>(p.x * (w - 1));
        unsigned int y = static_cast<unsigned int>(p.y * (h - 1));
        // return the bucket's linear index
        return y * w + x;
    }

    // grid dimensions in buckets (w = width, h = height)
    unsigned int w, h;
};
// Kernel: packs two float arrays into one float2 array, element by element:
//   d_Output[i] = (d_Input1[i], d_Input2[i])   for 0 <= i < nSize.
//
// Launch layout: 1D thread blocks on a 2D grid. nBlocksNumX is used as the
// number of blocks per grid row when flattening (blockIdx.x, blockIdx.y), so
// it is expected to equal gridDim.x of the launch. Threads whose flattened
// index is >= nSize do nothing, so the grid may be larger than the data.
// No shared memory is used.
__global__ void TransToFloat2(const float* __restrict__ d_Input1,
                              const float* __restrict__ d_Input2,
                              float2* __restrict__ d_Output,
                              const int nSize,
                              const int nBlocksNumX)
{
    // flatten the 2D block index, then add the thread offset within the block
    const int tid = (blockIdx.y * nBlocksNumX + blockIdx.x) * blockDim.x + threadIdx.x;
    if (tid < nSize)
    {
        d_Output[tid] = make_float2(d_Input1[tid], d_Input2[tid]);
    }
}
// Runs the reduced debug pipeline for one pyramid level.
//
// The level volume is derived from a fixed 512 x 512 x 32 base volume, each
// dimension being int((dim - 1) / 2^nLevel) + 1. Two zero-filled float
// buffers of that size are packed into a float2 array on the device, each
// point is mapped to a bucket index in a 150 x 150 grid, and the indices are
// sorted. The transform + sort sequence is repeated 15 times, printing the
// iteration number before and after the sort.
//
// nLevel: pyramid level; the code assumes nLevel >= 0 (not checked).
// Every CUDA runtime call is wrapped in checkCudaErrors, which reports the
// failure and terminates the process.
void MiniLevelCompute(const int nLevel)
{
    const int nMaxIterNum = 15;
    const int nWidth = 512;
    const int nHeight = 512;
    const int nSlice = 32;

    // 2^nLevel, computed once instead of once per dimension
    const double dLevelScale = pow(2.0, nLevel);
    const int nLevelWidth = int((nWidth - 1) / dLevelScale) + 1;
    const int nLevelHeight = int((nHeight - 1) / dLevelScale) + 1;
    const int nLevelSlice = int((nSlice - 1) / dLevelScale) + 1;
    const int nLevelSize = nLevelWidth * nLevelHeight * nLevelSlice;
    // byte counts are kept in size_t, the type the CUDA allocation API takes
    const size_t nDataSizeByte = static_cast<size_t>(nLevelSize) * sizeof(float);

    float* d_ImageDataRef_tmp = NULL;
    float* d_ImageDataMov_tmp = NULL;
    checkCudaErrors(cudaMalloc((void**)&d_ImageDataRef_tmp, nDataSizeByte));
    checkCudaErrors(cudaMalloc((void**)&d_ImageDataMov_tmp, nDataSizeByte));
    checkCudaErrors(cudaMemset(d_ImageDataRef_tmp, 0, nDataSizeByte));
    checkCudaErrors(cudaMemset(d_ImageDataMov_tmp, 0, nDataSizeByte));

    const int nHistBinsNum = 150;
    // bucket_begin, bucket_end and bucket_sizes are not read in this reduced
    // sample; they are kept so the device allocations match the original.
    thrust::device_vector<unsigned int> bucket_begin(nHistBinsNum * nHistBinsNum);
    thrust::device_vector<unsigned int> bucket_end(nHistBinsNum * nHistBinsNum);
    thrust::device_vector<unsigned int> bucket_indices(nLevelSize);
    thrust::device_vector<unsigned int> bucket_sizes(nHistBinsNum * nHistBinsNum);

    float2* d_points = NULL;
    checkCudaErrors(cudaMalloc((void**)&d_points, sizeof(float2) * nLevelSize));
    // wrap the raw pointer once so Thrust treats it as device memory
    thrust::device_ptr<float2> points_t(d_points);

    // 2D grid of 1D blocks: nBlocksNumX blocks per row, and as many rows as
    // are needed to cover nLevelSize elements.
    const int nBlocksNumX = 1024;
    const int nThreadsNumPerBlock = 256;
    dim3 nblocks;
    nblocks.x = nBlocksNumX;
    nblocks.y = ((1 + (nLevelSize - 1) / nThreadsNumPerBlock) - 1) / nBlocksNumX + 1;

    for (int nIter = 0; nIter < nMaxIterNum; nIter++)
    {
        TransToFloat2<<<nblocks, nThreadsNumPerBlock>>>(d_ImageDataRef_tmp, d_ImageDataMov_tmp, d_points, nLevelSize, nBlocksNumX);
        // a kernel launch returns no status; launch errors must be polled
        checkCudaErrors(cudaGetLastError());

        thrust::transform(points_t, points_t + nLevelSize, bucket_indices.begin(), point_to_bucket_index(nHistBinsNum, nHistBinsNum));
        std::cout << "debug Iter : " << nIter << std::endl;
        thrust::sort(bucket_indices.begin(), bucket_indices.end());
        std::cout << "debug Iter : " << nIter << std::endl;
    }

    // Free each buffer exactly once. The original freed d_ImageDataMov_tmp
    // twice and never released d_ImageDataRef_tmp.
    checkCudaErrors(cudaFree(d_points));
    checkCudaErrors(cudaFree(d_ImageDataMov_tmp));
    checkCudaErrors(cudaFree(d_ImageDataRef_tmp));
}
// Selects CUDA device 0, prints its main limits and compute capability, then
// runs MiniLevelCompute for level 1 followed by level 0.
//
// Returns 0 on success, or 1 if device 0 is not present. A failing CUDA
// runtime call is reported by checkCudaErrors, which terminates the process.
int test_CUDA_error()
{
    const int nDevice = 0;

    // check that the device exists before selecting it
    int nDeviceCount = 0;
    checkCudaErrors(cudaGetDeviceCount(&nDeviceCount));
    if (nDeviceCount <= nDevice)
    {
        fprintf(stderr, "CUDA device %d is not available (%d device(s) found)\n", nDevice, nDeviceCount);
        return 1;
    }
    checkCudaErrors(cudaSetDevice(nDevice));

    cudaDeviceProp cDeviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&cDeviceProp, nDevice));

    std::cout << "Using device # " << nDevice << std::endl;
    std::cout << "Max threads per block: " << cDeviceProp.maxThreadsPerBlock << std::endl;
    std::cout << "Max Threads DIM: " << cDeviceProp.maxThreadsDim[0] << " x " << cDeviceProp.maxThreadsDim[1] << " x " << cDeviceProp.maxThreadsDim[2] << std::endl;
    std::cout << "Max Grid Size: " << cDeviceProp.maxGridSize[0] << " x " << cDeviceProp.maxGridSize[1] << " x " << cDeviceProp.maxGridSize[2] << std::endl;
    printf("Device %d: \"%s\" with Compute %d.%d capability\n", nDevice, cDeviceProp.name, cDeviceProp.major, cDeviceProp.minor);

    // coarser level (1) first, then level 0
    for (int nLevel = 1; nLevel >= 0; nLevel--)
    {
        MiniLevelCompute(nLevel);
    }
    return 0;
}