With max_depth = 3, printf statements were added in cdpQuadtree() to print the data-range info of the last-level (leaf) nodes.
////////////////////////////////////////////////////////////////////////////////
// Allocate GPU structs, launch kernel and clean up
////////////////////////////////////////////////////////////////////////////////
// Build a quadtree on the GPU with CUDA Dynamic Parallelism and validate it on the host.
//
// warp_size: warp width of the current device (queried by the caller); used to
//            size the per-warp shared-memory scratch for the build kernel.
// Returns true when the host-side validation of the tree succeeds.
bool cdpQuadtree(int warp_size)
{
    // Constants to control the algorithm.
    const int num_points = 1024;
    const int max_depth = 3;
    const int min_points_per_node = 16;

    // Allocate memory for points. Two x/y buffer pairs: the build kernel
    // ping-pongs points between buffer 0 and buffer 1 at each level.
    thrust::device_vector<float> x_d0(num_points);
    thrust::device_vector<float> x_d1(num_points);
    thrust::device_vector<float> y_d0(num_points);
    thrust::device_vector<float> y_d1(num_points);

    // Generate random points into buffer 0.
    Random_generator rnd;
    thrust::generate(
        thrust::make_zip_iterator(thrust::make_tuple(x_d0.begin(), y_d0.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(x_d0.end(), y_d0.end())),
        rnd);

    // Host structures describing the device buffers.
    Points points_init[2];
    points_init[0].set(thrust::raw_pointer_cast(&x_d0[0]), thrust::raw_pointer_cast(&y_d0[0]));
    points_init[1].set(thrust::raw_pointer_cast(&x_d1[0]), thrust::raw_pointer_cast(&y_d1[0]));

    // Copy the two Points descriptors to the device so the kernel can use them.
    Points *points;
    checkCudaErrors(cudaMalloc((void **) &points, 2*sizeof(Points)));
    checkCudaErrors(cudaMemcpy(points, points_init, 2*sizeof(Points), cudaMemcpyHostToDevice));

    // Upper bound on node count for a full quadtree of depth max_depth:
    // sum of 4^i for i in [0, max_depth) — we could use the closed form,
    // but the loop is clearer for small max_depth.
    int max_nodes = 0;
    for (int i = 0, num_nodes_at_level = 1 ; i < max_depth ; ++i, num_nodes_at_level *= 4)
        max_nodes += num_nodes_at_level;

    // Allocate the node array and seed node 0 (the root) covering all points.
    Quadtree_node root;
    root.set_range(0, num_points);
    Quadtree_node *nodes;
    checkCudaErrors(cudaMalloc((void **) &nodes, max_nodes*sizeof(Quadtree_node)));
    checkCudaErrors(cudaMemcpy(nodes, &root, sizeof(Quadtree_node), cudaMemcpyHostToDevice));

    // CDP child launches nest one level per tree level: set the device-runtime
    // synchronization depth accordingly. Check the result like every other call.
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth));

    // Build the quadtree.
    Parameters params(max_depth, min_points_per_node);
    std::cout << "Launching CDP kernel to build the quadtree" << std::endl;
    const int NUM_THREADS_PER_BLOCK = 128; // Do not use less than 128 threads.
    const int NUM_WARPS_PER_BLOCK = NUM_THREADS_PER_BLOCK / warp_size;
    const size_t smem_size = 4*NUM_WARPS_PER_BLOCK*sizeof(int); // 4 child counters per warp
    build_quadtree_kernel<NUM_THREADS_PER_BLOCK><<<1, NUM_THREADS_PER_BLOCK, smem_size>>>(nodes, points, params);
    checkCudaErrors(cudaGetLastError());        // catches launch-configuration errors
    checkCudaErrors(cudaDeviceSynchronize());   // surfaces async execution errors from the CDP kernel

    // Copy points to CPU.
    thrust::host_vector<float> x_h(x_d0);
    thrust::host_vector<float> y_h(y_d0);
    Points host_points;
    host_points.set(thrust::raw_pointer_cast(&x_h[0]), thrust::raw_pointer_cast(&y_h[0]));

    // Copy nodes to CPU.
    Quadtree_node *host_nodes = new Quadtree_node[max_nodes];
    checkCudaErrors(cudaMemcpy(host_nodes, nodes, max_nodes *sizeof(Quadtree_node), cudaMemcpyDeviceToHost));

    // Validate the results.
    bool ok = check_quadtree(host_nodes, 0, num_points, &host_points, params);
    std::cout << "Results: " << (ok ? "OK" : "FAILED") << std::endl;

    //=================================================================================
    // Debug dump: print the point ranges of the last-level (leaf) nodes.
    // NOTE(review): this assumes a fully-populated tree layout where the final
    // 4^(max_depth-1) slots of the node array are the deepest level; nodes that
    // were never subdivided (fewer than min_points_per_node points) may hold
    // uninitialized data — confirm against build_quadtree_kernel's layout.
    printf("max_nodes: %d\n", max_nodes);
    // 4^(max_depth-1), computed in exact integer arithmetic instead of pow()
    // (pow returns double and was silently narrowed to int).
    const int leaf_node_count = 1 << (2 * (max_depth - 1));
    printf("leaf_node_count: %d\n", leaf_node_count);
    for (int i = max_nodes - leaf_node_count; i < max_nodes; ++i) {
        const Quadtree_node *node_ptr = host_nodes + i;
        /*printf("coor: %7f, %7f___%7f, %7f\n", node_ptr->bounding_box().get_min().x, node_ptr->bounding_box().get_min().y,
        node_ptr->bounding_box().get_max().x, node_ptr->bounding_box().get_max().y);*/
        printf("coor: %7d, %7d___\n", node_ptr->points_begin(), node_ptr->points_end());
    }
    //================================================================================

    // Free CPU memory.
    delete[] host_nodes;

    // Free GPU memory.
    checkCudaErrors(cudaFree(nodes));
    checkCudaErrors(cudaFree(points));
    return ok;
}