GPU silhouette calculation is as slow as or even slower than the CPU version, anything wrong?

Hello,

I’m trying to implement a GPU version of the silhouette score, but the performance is far from what I expected (a 1080 Ti takes 90 seconds on a very small dataset). I’m new to CUDA, so could you give me some hints on how to optimize the code? Thanks a lot.

I implemented a naive approach (method 1), then tried to optimize it (method 2). It turns out the optimized version is even slower T_T…
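For reference, this is what both methods compute: for each point i, a_i is the mean distance from i to the other points in its own cluster, and b_i is the smallest mean distance from i to the points of any other cluster. Then

    s_i = (b_i - a_i) / max(a_i, b_i)

and the final score is the mean of s_i over all n points.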

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <fstream>
#include <iostream>
#include <random>
#include <sstream>
#include <vector>
#include <math.h>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include "helper_cuda.h"
#include <vector>

using namespace std;
using namespace thrust;

__device__ float
distance(int d, float* point_a, float* point_b) {
  // Euclidean distance between two d-dimensional points
  float dist = 0;
  for (int i = 0; i < d; ++i)
    dist += pow((point_a[i] - point_b[i]), 2);
  return sqrt(dist);
}

constexpr int KMAX=128;

// Method 1: one thread per point; each thread scans all n points
__global__ void
get_score_sum1(const int n, const int k, const int d,
               const thrust::device_ptr<float> data,
               const thrust::device_ptr<int> labels,
               thrust::device_ptr<float> sum) {
    
    // per-thread accumulators: summed distance and point count per cluster
    float accum[KMAX];
    float count[KMAX];

    for (int i=0; i<k; ++i){
        accum[i]=0;
        count[i]=0;
    }

    int data_index = threadIdx.x + blockIdx.x * blockDim.x;
    if (data_index >= n) return;  // guard the last (possibly partial) block

    //----------------------------------------------------
    // compute ai (mean intra-cluster distance) and bi (mean distance
    // to the nearest other cluster)
    int label_i = labels[data_index];
    for (size_t j = 0; j < n; ++j) {
        int label_j = labels[j];
        if (data_index != j) {
            accum[label_j] += distance(d,
                                       raw_pointer_cast(data + data_index * d),
                                       raw_pointer_cast(data + j * d));
            count[label_j] += 1;
        }
    }


    float ai = count[label_i] > 0 ? accum[label_i] / count[label_i] : 0;
    float bi = FLT_MAX;

    // bi: smallest mean distance to any *other* cluster
    for (size_t j = 0; j < k; ++j) {
        if (j == label_i) continue;
        float m = count[j] > 0 ? accum[j] / count[j] : 0;
        if (m < bi)
            bi = m;
    }

    //----------------------------------------------------
    float s = (bi - ai) / fmax(ai, bi);
    s = isnan(s) ? 0 : s;  // ai == bi == 0 would give 0/0
    atomicAdd(thrust::raw_pointer_cast(sum), s);
}

// Method 2, step 1: one (i, j) pair per thread; accumulate, for every point i,
// the summed distance and point count per cluster
__global__ void
get_accum_distances(const int n, const int k, const int d,
              const thrust::device_ptr<float> data,
              const thrust::device_ptr<int> labels,
              thrust::device_ptr<float> accum,
              thrust::device_ptr<int> count)
{   
    // int i = threadIdx.x + blockIdx.x * blockDim.x;
    // int j = threadIdx.y + blockIdx.y * blockDim.y;

    __shared__ float data_i[512];
    __shared__ float data_j[512];
    __shared__ int label_j[32];

    int data_i_index = threadIdx.x + blockIdx.x * blockDim.x;
    if ((data_i_index < n) && (threadIdx.y == 0)) {
        for (int i=0; i<d; ++i)
            data_i[threadIdx.x + i] = data[data_i_index * d + i];
    }
    
    int data_j_index = threadIdx.x + blockIdx.y * blockDim.y;
    if ((data_j_index < n) && (threadIdx.y == 1)) {
        for (int i=0; i<d; ++i)
            data_j[threadIdx.x + i] = data[data_j_index * d + i];
    }

    if ((data_j_index < n) && (threadIdx.y == 2)) {
        label_j[threadIdx.x] = labels[data_j_index];
    }

    __syncthreads();

    data_j_index = threadIdx.y + blockIdx.y * blockDim.y;

    if ((data_i_index<n) && (data_j_index<n)){
        int lj = label_j[threadIdx.y]; 
        float dist = distance(d, 
                                data_i + threadIdx.x * d,
                                data_j + threadIdx.y * d);
            
        atomicAdd(thrust::raw_pointer_cast(accum + data_i_index * k + lj), dist);
        atomicAdd(thrust::raw_pointer_cast(count + data_i_index * k + lj), 1);
    }

}


// Method 2, step 2: turn the per-cluster sums into ai, bi and each point's
// silhouette contribution
__global__ void
get_score_sum2(const int n, const int k, const int d,
                const thrust::device_ptr<int> labels,
                const thrust::device_ptr<float> accum,
                const thrust::device_ptr<int> count,
                thrust::device_ptr<float> sum) {
      
    int data_index = threadIdx.x + blockIdx.x * blockDim.x;
    
    if (data_index >= n) return;
    
    int label_i = labels[data_index];
    float ai = count[label_i] > 0 ? accum[label_i] / count[label_i] : 0;
    float bi = FLT_MAX;

    for (size_t j = 0; j < k; ++j) {
        if (j == label_i) continue;
        float m = count[j] > 0 ? accum[j] / count[j] : 0;
        if (m < bi)
            bi = m;
    }

    //----------------------------------------------------
    float s = (bi - ai) / fmax(ai, bi);
    s = isnan(s) ? 0 : s;  // ai == bi == 0 would give 0/0
    atomicAdd(thrust::raw_pointer_cast(sum), s);
}


int main() {
  
    int n = 1e5;
    int d = 16;
    int k = 16;
  
    thrust::device_vector<float> d_data(n * d);
    random_data(d_data, n, d);    // helper (not shown here): n random d-dim points

    thrust::device_vector<int> d_label(n);  // one label in [0, k) per point
    random_labels(d_label, n, k); // helper (not shown here): n random labels
  
    thrust::device_vector<float> d_sum(1, 0);

    thrust::device_vector<float> d_accum(n * k, 0);
    thrust::device_vector<int> d_count(n * k, 0);
  
    const int threads = 32;
    const int blocks = (n + threads - 1) / threads;
  
    auto start = std::chrono::high_resolution_clock::now();
  
    // Method 2 #######################################################
    get_accum_distances<<<dim3(blocks,blocks), dim3(threads,threads)>>>(n, k, d,
                                    d_data.data(),
                                    d_label.data(),
                                    d_accum.data(),
                                    d_count.data());

    get_score_sum2<<<blocks, threads>>>(n, k, d,
                                        d_label.data(),
                                        d_accum.data(),
                                        d_count.data(),
                                        d_sum.data());

    // float sum = d_sum[0];
    // float score = sum/n;

    cudaDeviceSynchronize();

    auto end = std::chrono::high_resolution_clock::now();
    auto duration =
        std::chrono::duration_cast<std::chrono::duration<float>>(end - start);
    std::cerr << "Took: " << duration.count() << "s" << std::endl;

    start = std::chrono::high_resolution_clock::now();
    // Method 1 #############################################################

    get_score_sum1<<<blocks, threads>>>(n, k, d,
        d_data.data(),
        d_label.data(),
        d_sum.data());

    // float sum = d_sum[0];
    // float score = sum/n;

    cudaDeviceSynchronize();
    
    end = std::chrono::high_resolution_clock::now();
    duration =
        std::chrono::duration_cast<std::chrono::duration<float>>(end - start);
    std::cerr << "Took: " << duration.count() << "s" << std::endl;
    // cout << "Score: " << score <<endl;
}
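For completeness: random_data and random_labels are just small helpers that fill the vectors with uniform random values. Roughly like this (a sketch, not the exact code):

// Assumed helper implementations (not part of the kernels above): fill the
// device vectors with uniform random points and labels on the host, then copy.
void random_data(thrust::device_vector<float>& data, int n, int d) {
    std::mt19937 gen(42);
    std::uniform_real_distribution<float> dis(0.f, 1.f);
    thrust::host_vector<float> h(n * d);
    for (int i = 0; i < n * d; ++i) h[i] = dis(gen);
    data = h;  // host -> device copy
}

void random_labels(thrust::device_vector<int>& labels, int n, int k) {
    std::mt19937 gen(7);
    std::uniform_int_distribution<int> dis(0, k - 1);
    thrust::host_vector<int> h(n);
    for (int i = 0; i < n; ++i) h[i] = dis(gen);
    labels = h;  // host -> device copy
}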

What does the code for the CPU implementation look like? Do your GPU kernels produce correct results? The kernel get_accum_distances looks wrong to me.

    int data_j_index = threadIdx.x + blockIdx.y * blockDim.y;
    if ((data_j_index < n) && (threadIdx.y == 1)) {
        for (int i=0; i<d; ++i)
            data_j[threadIdx.x + i] = data[data_j_index * d + i];
    }

Let’s say d = 2 and blockDim.x is 32. In the first iteration (i = 0) you write to positions data_j[0..31]. In the next iteration (i = 1) you write to positions data_j[1..32], which overwrites the data stored in the previous iteration and introduces a race condition between threads.
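A fix would be to give every thread its own d-wide slot, something like this (a sketch, untested; with d = 16 and 32 threads this exactly fills the 512-float buffer, and the data_i load needs the same fix):

    if ((data_j_index < n) && (threadIdx.y == 1)) {
        // each thread copies its point into slot [threadIdx.x * d, threadIdx.x * d + d)
        for (int i = 0; i < d; ++i)
            data_j[threadIdx.x * d + i] = data[data_j_index * d + i];
    }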