Hello,
I’m trying to implement a GPU version silhouette score, but the performance is far from what I expected (1080ti takes 90 seconds to finish a very small dataset). New to CUDA, could you guys help me with some hints, how should I optimize the code? Thanks a lot.
I implement a naive approach (method 1), then tried to optimize it (method 2). Turns out it’s even slower T_T…
#include <algorithm>
#include <cfloat>
#include <chrono>
#include <fstream>
#include <iostream>
#include <random>
#include <sstream>
#include <vector>
#include <math.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include "helper_cuda.h"
#include <vector>
using namespace std;
using namespace thrust;
__device__ float
distance(int d, float* point_a, float* point_b) {
float dist = 0;
for (int i=0; i<d; ++i)
dist += pow((point_a[i] - point_b[i]), 2);
return sqrt(dist);
}
constexpr int KMAX=128;
__global__ void
get_score_sum1(const int n, const int k, const int d,
const thrust::device_ptr<float> data,
const thrust::device_ptr<int> labels,
thrust::device_ptr<float> sum) {
float accum[KMAX];
float count[KMAX];
for (int i=0; i<k; ++i){
accum[i]=0;
count[i]=0;
}
int data_index = threadIdx.x + blockIdx.x * blockDim.x;
//----------------------------------------------------
//compute ai,bi
int label_i = labels[data_index];
for (size_t j = 0;j < n;++j) {
int label_j = labels[j];
if (data_index != j){
accum[label_j] += distance(d,
raw_pointer_cast(data + data_index * d),
raw_pointer_cast(data + j * d));
count[label_j] += 1;
}
}
float ai = count[label_i]>0 ? accum[label_i] / count[label_i] : 0;
float bi = FLT_MAX;
for (size_t j = 0;j < k;j++) {
float m = count[j]>0 ? accum[j] / count[j] : 0;
if (j == label_i) continue;
if (m < bi)
bi = m;
}
//----------------------------------------------------
float s = (bi - ai) / fmax(ai, bi);
s = isnan(s) ? 0 : s;
atomicAdd(thrust::raw_pointer_cast(sum), s);
}
__global__ void
get_accum_distances(const int n, const int k, const int d,
const thrust::device_ptr<float> data,
const thrust::device_ptr<int> labels,
thrust::device_ptr<float> accum,
thrust::device_ptr<int> count)
{
// int i = threadIdx.x + blockIdx.x * blockDim.x;
// int j = threadIdx.y + blockIdx.y * blockDim.y;
__shared__ float data_i[512];
__shared__ float data_j[512];
__shared__ int label_j[32];
int data_i_index = threadIdx.x + blockIdx.x * blockDim.x;
if ((data_i_index < n) && (threadIdx.y == 0)) {
for (int i=0; i<d; ++i)
data_i[threadIdx.x + i] = data[data_i_index * d + i];
}
int data_j_index = threadIdx.x + blockIdx.y * blockDim.y;
if ((data_j_index < n) && (threadIdx.y == 1)) {
for (int i=0; i<d; ++i)
data_j[threadIdx.x + i] = data[data_j_index * d + i];
}
if ((data_j_index < n) && (threadIdx.y == 2)) {
label_j[threadIdx.x] = labels[data_j_index];
}
__syncthreads();
data_j_index = threadIdx.y + blockIdx.y * blockDim.y;
if ((data_i_index<n) && (data_j_index<n)){
int lj = label_j[threadIdx.y];
float dist = distance(d,
data_i + threadIdx.x * d,
data_j + threadIdx.y * d);
atomicAdd(thrust::raw_pointer_cast(accum + data_i_index * k + lj), dist);
atomicAdd(thrust::raw_pointer_cast(count + data_i_index * k + lj), 1);
}
}
__global__ void
get_score_sum2(const int n, const int k, const int d,
const thrust::device_ptr<int> labels,
const thrust::device_ptr<float> accum,
const thrust::device_ptr<int> count,
thrust::device_ptr<float> sum) {
int data_index = threadIdx.x + blockIdx.x * blockDim.x;
if (data_index >= n) return;
int label_i = labels[data_index];
float ai = count[label_i]>0 ? accum[label_i] / count[label_i] : 0;
float bi = FLT_MAX;
for (size_t j = 0;j < k;j++) {
float m = count[j]>0 ? accum[j] / count[j] : 0;
if (j == label_i) continue;
if (m < bi)
bi = m;
}
//----------------------------------------------------
float s = (bi - ai) / fmax(ai, bi);
s = isnan(s) ? 0 : s;
atomicAdd(thrust::raw_pointer_cast(sum), s);
}
int main() {
int n = 1e5;
int d = 16;
int k = 16;
thrust::device_vector<float> d_data(n * d);
random_data(d_data, n, d);
thrust::device_vector<int> d_label(k);
random_labels(d_label, n, k);
thrust::device_vector<float> d_sum(1, 0);
thrust::device_vector<float> d_accum(n * k, 0);
thrust::device_vector<int> d_count(n * k, 0);
const int threads = 32;
const int blocks = (n + threads - 1) / threads;
auto start = std::chrono::high_resolution_clock::now();
// Method 2 #######################################################
get_accum_distances<<<dim3(blocks,blocks), dim3(threads,threads)>>>(n, k, d,
d_data.data(),
d_label.data(),
d_accum.data(),
d_count.data());
get_score_sum2<<<blocks, threads>>>(n, k, d,
d_label.data(),
d_accum.data(),
d_count.data(),
d_sum.data());
// float sum = d_sum[0];
// float score = sum/n;
cudaDeviceSynchronize();
auto end = std::chrono::high_resolution_clock::now();
auto duration =
std::chrono::duration_cast<std::chrono::duration<float>>(end - start);
std::cerr << "Took: " << duration.count() << "s" << std::endl;
start = std::chrono::high_resolution_clock::now();
// Method 1 #############################################################
get_score_sum1<<<blocks, threads>>>(n, k, d,
d_data.data(),
d_label.data(),
d_sum.data());
// float sum = d_sum[0];
// float score = sum/n;
cudaDeviceSynchronize();
end = std::chrono::high_resolution_clock::now();
duration =
std::chrono::duration_cast<std::chrono::duration<float>>(end - start);
std::cerr << "Took: " << duration.count() << "s" << std::endl;
// cout << "Score: " << score <<endl;
}