Help! Cuda kernel become super slow after adding a "+" operation

Trying to write a cuda kernel to calculate Silhouette score to evaluate clustering performance.
The following snippet is the minimum code to show my problem. The code runs fast, but after add accum[label_j] += dis; in line 59, it becomes really really slow. without line 59 it can finish within 2e-6 s, with line 59 it takes 90s. What’s wrong? Thanks for help =)

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <fstream>
#include <iostream>
#include <random>
#include <sstream>
#include <vector>
#include <math.h>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include "helper_cuda.h"
#include <vector>

using namespace std;
using namespace thrust;

__device__ float
distance(int d, float* point_a, float* point_b) {
  float dist = 0;
  for (int i=0; i<d; ++i)
    dist += pow((point_a[i] - point_b[i]), 2);
  return sqrt(dist);

constexpr int KMAX=128;

__global__ void
get_score_sum(const int n, const int k, const int d, 
                float* data,
                int* labels, 
                float* sum) 
    int data_index = threadIdx.x + blockIdx.x * blockDim.x;

    if (data_index>=n)
    float accum[KMAX];

    for (int i=0; i<k; ++i){
    //compute ai,bi
    int label_i = labels[data_index];
    for (size_t j = 0;j < n;++j) {
        int label_j = labels[j];
        if (data_index != j){
            float dis = distance(d, 
                raw_pointer_cast(data + data_index * d),
                raw_pointer_cast(data + j * d));

            accum[label_j] += dis;

__global__ void
test(const int n, const thrust::device_ptr<float> data){
    for (int i=0; i<n; ++i)
        printf("test: %f", *raw_pointer_cast(data+i));

int main() {
    int n = 1e5;
    int d = 16;
    int k = 16;
    thrust::device_vector<float> d_data(n * d);
    random_data(d_data, n, d);

    thrust::device_vector<int> d_label(k);
    random_labels(d_label, n, k);
    thrust::device_vector<float> d_sum(1);
    d_sum[0] = 0;
    const int threads = 1024;
    const int blocks = (n + threads - 1) / threads;
    const auto start = std::chrono::high_resolution_clock::now();
    get_score_sum<<<blocks, threads>>>(n, k, d,

    cudaError_t c_ret = cudaGetLastError();
    if (c_ret) {
        std::cout << "Error: " << cudaGetErrorString(c_ret) << "-->";
    const auto end = std::chrono::high_resolution_clock::now();
    const auto duration =
        std::chrono::duration_cast<std::chrono::duration<float>>(end - start);
    std::cerr << "Took: " << duration.count() << "s" << std::endl;

From a cursory look at the posted code, without this line, it seems to do no, or very nearly no, externally observable work. The compiler is very good at dead code elimination. All operations that do not affect externally visible state (i.e. that do not ultimately contribute to data written to global memory) are removed.

A single null kernel that does nothing should take about 5 microseconds to launch, so I am not sure how the measured elapsed time of 2 microseconds came about.

1 Like

Thanks a lot for your explanation =)