I have an array with many points, these points belong to different objects, the data of the array is arranged in the order [x,x,x,x,x…y,y,y,y,y… .z,z,z,z,z]. In addition, I know the index of points belong to which object. now I want to calculate the center of gravity of each object. That is, the average value of the x, y, and z coordinates of all points corresponding to this object.
The attached code is written by myself, but the speed is relatively slow. Can anyone give me some suggestions for optimizing efficiency?
#include <iostream>
#include <stdio.h>
#include <math.h>
#include <stddef.h>
#include <Eigen/Dense>
#include <cuda.h>
#include <cuda_runtime.h>
#include <chrono>
static void HandleError(cudaError_t err, const char *file, int line)
{
if (err != cudaSuccess)
{
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
// one thread for each object
__global__ void getMean(const float *matched_srcs, const int num_src_total, const int *index_srcs,
const int *step_srcs, const int num_objs, float *means)
{
size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= num_objs)
{
return;
}
int start, end;
if (tid == 0)
{
start = 0;
end = step_srcs[0];
}
else
{
start = step_srcs[tid - 1];
end = step_srcs[tid];
}
float src_x = 0.0;
float src_y = 0.0;
float src_z = 0.0;
int match_num = end - start;
for (int i = start; i < end; ++i)
{
src_x += matched_srcs[i];
src_y += matched_srcs[i + num_src_total];
src_z += matched_srcs[i + num_src_total * 2];
}
src_x = __fdividef(src_x, match_num);
src_y = __fdividef(src_y, match_num);
src_z = __fdividef(src_z, match_num);
means[tid * 3 + 0] = src_x;
means[tid * 3 + 1] = src_y;
means[tid * 3 + 2] = src_z;
}
int main(int argc, char *argv[])
{
// prepare cpu data
int num_objs = 50;
int num_src = 1000;
int num_src_total = num_objs * num_src;
std::vector<float> host_src_point(num_src_total * 3, 1);
std::vector<int> host_step_srcs, host_index_srcs;
for (int i = 0; i < num_objs; ++i)
{
host_step_srcs.push_back(num_src * (i + 1));
for (int j = 0; j < num_src; ++j)
{
host_index_srcs.push_back(i);
}
}
// prepare gpu data
float *source_points = nullptr;
cudaMalloc((void **)&source_points, sizeof(float) * num_src_total * 3);
cudaMemcpy(source_points, host_src_point.data(), sizeof(float) * num_src_total * 3, cudaMemcpyHostToDevice);
float *original_sources = nullptr;
cudaMalloc((void **)&original_sources, sizeof(float) * num_src_total * 3);
cudaMemcpy(original_sources, host_src_point.data(), sizeof(float) * num_src_total * 3, cudaMemcpyHostToDevice);
int *index_srcs = nullptr;
cudaMalloc((void **)&index_srcs, sizeof(int) * num_src_total);
cudaMemcpy(index_srcs, host_index_srcs.data(), sizeof(int) * num_src_total, cudaMemcpyHostToDevice);
int *step_srcs = nullptr;
cudaMalloc((void **)&step_srcs, sizeof(int) * num_objs);
cudaMemcpy(step_srcs, host_step_srcs.data(), sizeof(int) * num_objs, cudaMemcpyHostToDevice);
float *matched_srcs = nullptr;
cudaMalloc((void **)&matched_srcs, sizeof(float) * num_src_total * 3);
float max_dist = 10.0;
cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
float elapsed = 0;
cudaEvent_t start, stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start, stream));
int *num_points = nullptr;
cudaMalloc((void **)&num_points, sizeof(int) * num_objs);
cudaMemset(num_points, 0, sizeof(int) * num_objs);
float *means = nullptr;
cudaMalloc((void **)&means, sizeof(float) * num_objs * 6);
int thre_per_block = 64;
int block = 1;
getMean<<<block, thre_per_block, 0, stream>>>(source_points,
num_src_total,
index_srcs,
step_srcs,
num_objs,
means);
cudaStreamSynchronize(stream);
HANDLE_ERROR(cudaEventRecord(stop, stream));
HANDLE_ERROR(cudaEventSynchronize(stop));
HANDLE_ERROR(cudaEventElapsedTime(&elapsed, start, stop));
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
std::cout << "the gpu time cost is: " << elapsed << std::endl;
std::vector<float> ans_srcs(num_objs * 6, 0);
cudaMemcpy(ans_srcs.data(), means, sizeof(float) * num_objs * 6, cudaMemcpyDeviceToHost);
}