Trying to write a cuda kernel to calculate Silhouette score to evaluate clustering performance.
The following snippet is the minimum code to show my problem. The code runs fast, but after add accum[label_j] += dis; in line 59, it becomes really really slow. without line 59 it can finish within 2e-6 s, with line 59 it takes 90s. What’s wrong? Thanks for help =)
#include <algorithm>
#include <cfloat>
#include <chrono>
#include <fstream>
#include <iostream>
#include <random>
#include <sstream>
#include <vector>
#include <math.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include "helper_cuda.h"
#include <vector>
using namespace std;
using namespace thrust;
__device__ float
distance(int d, float* point_a, float* point_b) {
float dist = 0;
for (int i=0; i<d; ++i)
dist += pow((point_a[i] - point_b[i]), 2);
return sqrt(dist);
}
constexpr int KMAX=128;
__global__ void
get_score_sum(const int n, const int k, const int d,
float* data,
int* labels,
float* sum)
{
int data_index = threadIdx.x + blockIdx.x * blockDim.x;
if (data_index>=n)
return;
float accum[KMAX];
for (int i=0; i<k; ++i){
accum[k]=0;
}
//----------------------------------------------------
//compute ai,bi
int label_i = labels[data_index];
for (size_t j = 0;j < n;++j) {
int label_j = labels[j];
if (data_index != j){
float dis = distance(d,
raw_pointer_cast(data + data_index * d),
raw_pointer_cast(data + j * d));
accum[label_j] += dis;
}
}
}
__global__ void
test(const int n, const thrust::device_ptr<float> data){
for (int i=0; i<n; ++i)
printf("test: %f", *raw_pointer_cast(data+i));
}
int main() {
int n = 1e5;
int d = 16;
int k = 16;
thrust::device_vector<float> d_data(n * d);
random_data(d_data, n, d);
thrust::device_vector<int> d_label(k);
random_labels(d_label, n, k);
thrust::device_vector<float> d_sum(1);
d_sum[0] = 0;
const int threads = 1024;
const int blocks = (n + threads - 1) / threads;
const auto start = std::chrono::high_resolution_clock::now();
get_score_sum<<<blocks, threads>>>(n, k, d,
raw_pointer_cast(d_data.data()),
raw_pointer_cast(d_label.data()),
raw_pointer_cast(d_sum.data()));
cudaError_t c_ret = cudaGetLastError();
if (c_ret) {
std::cout << "Error: " << cudaGetErrorString(c_ret) << "-->";
}
cudaDeviceSynchronize();
const auto end = std::chrono::high_resolution_clock::now();
const auto duration =
std::chrono::duration_cast<std::chrono::duration<float>>(end - start);
std::cerr << "Took: " << duration.count() << "s" << std::endl;
}