Welford's algorithm

Hello,
Below is my implementation of Welford’s algorithm for computing the mean and variance (error checking left out). It seems to work correctly, but it is terribly slow (>500 microseconds on my RTX 3070). Am I missing something? I’d really appreciate any corrections/hints/ideas. Thank you very much.

#include <cuda_runtime.h>
#include <iostream>
#include <vector>

struct WelfordData {
    double mean;
    double M2;
    long long count;
};

// Welford's online update: fold one sample into the running mean and
// M2 (the running sum of squared deviations from the mean).
__device__ void welford_update(WelfordData& wd, double x) {
    wd.count++;
    double delta = x - wd.mean;
    wd.mean += delta / wd.count;
    wd.M2 += delta * (x - wd.mean);
}

__global__ void welford_kernel(const float* data, int n, WelfordData* block_results) {
    extern __shared__ WelfordData shared_wd[];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int lane = threadIdx.x;
    
    // Grid-stride loop: each thread accumulates its own partial statistics.
    WelfordData local_wd = {0.0, 0.0, 0};
    while (tid < n) {
        welford_update(local_wd, static_cast<double>(data[tid]));
        tid += blockDim.x * gridDim.x;
    }
    
    shared_wd[lane] = local_wd;
    __syncthreads();
    
    // Shared-memory tree reduction, merging pairs of partial results with
    // the parallel combination formula (Chan et al.).
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (lane < s) {
            WelfordData& a = shared_wd[lane];
            WelfordData& b = shared_wd[lane + s];
            if (b.count > 0) {
                double delta = b.mean - a.mean;
                long long combined_count = a.count + b.count;
                a.M2 += b.M2 + delta * delta * a.count * b.count / combined_count;
                a.mean += delta * b.count / combined_count;
                a.count = combined_count;
            }
        }
        __syncthreads();
    }
    
    if (lane == 0) {
        block_results[blockIdx.x] = shared_wd[0];
    }
}

__global__ void reduce_welford(WelfordData* block_results, int num_blocks, double* mean, double* var) {
    __shared__ WelfordData shared_wd[256];  // must match the launch block size
    int tid = threadIdx.x;
    WelfordData wd = {0.0, 0.0, 0};
    
    for (int i = tid; i < num_blocks; i += blockDim.x) {
        WelfordData b = block_results[i];
        if (b.count == 0) continue;
        double delta = b.mean - wd.mean;
        long long combined_count = wd.count + b.count;
        wd.M2 += b.M2 + delta * delta * wd.count * b.count / combined_count;
        wd.mean += delta * b.count / combined_count;
        wd.count = combined_count;
    }
    
    shared_wd[tid] = wd;
    __syncthreads();
    
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            WelfordData& a = shared_wd[tid];
            WelfordData& b = shared_wd[tid + s];
            if (b.count > 0) {
                double delta = b.mean - a.mean;
                long long combined_count = a.count + b.count;
                a.M2 += b.M2 + delta * delta * a.count * b.count / combined_count;
                a.mean += delta * b.count / combined_count;
                a.count = combined_count;
            }
        }
        __syncthreads();
    }
    
    if (tid == 0) {
        *mean = shared_wd[0].mean;
        *var = (shared_wd[0].count > 1) ? shared_wd[0].M2 / shared_wd[0].count : 0;
        // This computes biased variance (as opposed to the unbiased variance, which would divide by count - 1)
    }
}

void compute_welford(const std::vector<float>& h_data, double& h_mean, double& h_var) {
    int n = h_data.size();
    if (n == 0) {
        h_mean = 0;
        h_var = 0;
        return;
    }
    
    float *d_data;
    double *d_mean, *d_var;
    WelfordData *d_block_results;
    
    cudaMalloc(&d_data, n * sizeof(float));
    cudaMalloc(&d_mean, sizeof(double));
    cudaMalloc(&d_var, sizeof(double));
    
    int blockSize = 256;  // must be a power of 2 for reduction to work correctly
    int numBlocks = (n + blockSize - 1) / blockSize;
    cudaMalloc(&d_block_results, numBlocks * sizeof(WelfordData));
    
    cudaMemcpy(d_data, h_data.data(), n * sizeof(float), cudaMemcpyHostToDevice);
    
    welford_kernel<<<numBlocks, blockSize, blockSize * sizeof(WelfordData)>>>(d_data, n, d_block_results);
    reduce_welford<<<1, blockSize>>>(d_block_results, numBlocks, d_mean, d_var);
    
    cudaMemcpy(&h_mean, d_mean, sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(&h_var, d_var, sizeof(double), cudaMemcpyDeviceToHost);
    
    cudaFree(d_data);
    cudaFree(d_mean);
    cudaFree(d_var);
    cudaFree(d_block_results);
}

int main() {
    const int size = 1024 * 1280;
    std::vector<float> data(size);
    for (int i = 0; i < size; i++)
        data[i] = static_cast<float>(i);
    
    double mean, var;
    compute_welford(data, mean, var);
    
    std::cout << "Mean: " << mean << "\n";
    std::cout << "Biased Variance: " << var << "\n";
    
    return 0;
}

double is slow on non-datacenter GPUs; consumer Ampere cards like your RTX 3070 run FP64 at 1/64 of FP32 throughput.

I wasn’t able to get the required precision using floats :-(

In addition, I also tried a warp-shuffle reduction, but it gives me a slightly wrong result. Maybe someone can spot the problem in this implementation:

#include <cuda_runtime.h>
#include <iostream>
#include <vector>

struct WelfordData {
    double mean;
    double M2;
    long long count;
};

__device__ void welford_update(WelfordData& wd, double x) {
    wd.count++;
    double delta = x - wd.mean;
    wd.mean += delta / wd.count;
    wd.M2 += delta * (x - wd.mean);
}

// Warp-level reduction: each lane merges in the partial result shuffled
// down from the lane `offset` positions above it.
__device__ WelfordData warp_reduce(WelfordData wd) {
    for (int offset = 16; offset > 0; offset >>= 1) {
        WelfordData other;
        other.mean = __shfl_down_sync(0xFFFFFFFF, wd.mean, offset);
        other.M2 = __shfl_down_sync(0xFFFFFFFF, wd.M2, offset);
        other.count = __shfl_down_sync(0xFFFFFFFF, wd.count, offset);
        
        if (other.count > 0) {
            double delta = other.mean - wd.mean;
            long long combined_count = wd.count + other.count;
            wd.mean += delta * other.count / combined_count;
            wd.M2 += other.M2 + delta * delta * wd.count * other.count / combined_count;
            wd.count = combined_count;
        }
    }
    return wd;
}

__global__ void welford_kernel(const float* data, int n, WelfordData* block_results) {
    __shared__ WelfordData shared_wd[32];  // one partial result per warp
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int lane = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;

    WelfordData local_wd = {0.0, 0.0, 0};
    while (tid < n) {
        welford_update(local_wd, static_cast<double>(data[tid]));
        tid += blockDim.x * gridDim.x;
    }

    local_wd = warp_reduce(local_wd);

    if (lane == 0) {
        shared_wd[warp_id] = local_wd;
    }
    __syncthreads();

    if (warp_id == 0) {
        WelfordData final_wd = {0.0, 0.0, 0};
        if (lane < blockDim.x / 32) {
            final_wd = shared_wd[lane];
        }
        final_wd = warp_reduce(final_wd);
        if (lane == 0) {
            block_results[blockIdx.x] = final_wd;
        }
    }
}

__global__ void reduce_welford(WelfordData* block_results, int num_blocks, double* mean, double* var) {
    WelfordData wd = {0.0, 0.0, 0};

    for (int i = threadIdx.x; i < num_blocks; i += blockDim.x) {
        WelfordData b = block_results[i];
        if (b.count > 0) {
            double delta = b.mean - wd.mean;
            long long combined_count = wd.count + b.count;
            wd.mean += delta * b.count / combined_count;
            wd.M2 += b.M2 + delta * delta * wd.count * b.count / combined_count;
            wd.count = combined_count;
        }
    }

    wd = warp_reduce(wd);

    if (threadIdx.x == 0) {
        *mean = wd.mean;
        *var = (wd.count > 1) ? wd.M2 / wd.count : 0; // Use biased variance (n denominator)
    }
}

void compute_welford(const std::vector<float>& h_data, double& h_mean, double& h_var) {
    int n = h_data.size();
    if (n == 0) {
        h_mean = 0;
        h_var = 0;
        return;
    }

    float *d_data;
    double *d_mean, *d_var;
    WelfordData *d_block_results;

    cudaMalloc(&d_data, n * sizeof(float));
    cudaMalloc(&d_mean, sizeof(double));
    cudaMalloc(&d_var, sizeof(double));

    int blockSize = 256;
    int numBlocks = (n + blockSize - 1) / blockSize;
    cudaMalloc(&d_block_results, numBlocks * sizeof(WelfordData));
    cudaMemset(d_block_results, 0, numBlocks * sizeof(WelfordData));

    cudaMemcpy(d_data, h_data.data(), n * sizeof(float), cudaMemcpyHostToDevice);

    welford_kernel<<<numBlocks, blockSize>>>(d_data, n, d_block_results);
    reduce_welford<<<1, blockSize>>>(d_block_results, numBlocks, d_mean, d_var);

    cudaMemcpy(&h_mean, d_mean, sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(&h_var, d_var, sizeof(double), cudaMemcpyDeviceToHost);

    cudaFree(d_data);
    cudaFree(d_mean);
    cudaFree(d_var);
    cudaFree(d_block_results);
}

int main() {
    const int size = 1024 * 1280;
    std::vector<float> data(size);
    for (int i = 0; i < size; i++) {
        data[i] = static_cast<float>(i);
    }
    
    double mean, var;
    compute_welford(data, mean, var);

    std::cout << "Mean: " << mean << "\n";
    std::cout << "Biased Variance: " << var << "\n";

    return 0;
}

One thing I noticed, at least from a quick look, is that you are doing A LOT of implicit conversions due to mixed-type operations. Every time you multiply a double by a long long, the long long gets converted to a double first.

What performance difference do you get if you make count a double instead of a long long?
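
For illustration, a minimal sketch of that change (untested); a double represents integers exactly up to 2^53, so the count stays exact for any realistic n:

struct WelfordData {
    double mean;
    double M2;
    double count;  // stored as double to avoid long long -> double conversions
};

__device__ void welford_update(WelfordData& wd, double x) {
    wd.count += 1.0;
    double delta = x - wd.mean;
    wd.mean += delta / wd.count;
    wd.M2 += delta * (x - wd.mean);
}

The merge step then becomes pure double arithmetic as well (combined_count, the delta * delta * a.count * b.count / combined_count term, and so on).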

Also, your struct is 64+64+64 bits = 24 bytes. That is not a power of two, so consecutive WelfordData elements straddle 128-byte cache lines, and some loads of the struct span two lines… ?
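
If that turns out to matter, one untested idea is to pad the struct to 32 bytes so each element sits in one naturally aligned chunk:

struct __align__(32) WelfordData {
    double mean;
    double M2;
    long long count;
    // 8 bytes of implicit tail padding bring sizeof(WelfordData) to 32
};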

How large does wd.count get? If each thread only ever sees a few samples, you can use a lookup table for the reciprocal 1.0 / count, e.g. in shared memory. Divisions are slow.
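
A rough sketch of that idea (hypothetical; MAX_LOCAL_COUNT is an assumed upper bound on the per-thread sample count):

#define MAX_LOCAL_COUNT 1024  // assumed bound on per-thread count

// At the top of welford_kernel:
__shared__ double recip[MAX_LOCAL_COUNT + 1];
for (int i = threadIdx.x; i <= MAX_LOCAL_COUNT; i += blockDim.x)
    recip[i] = (i > 0) ? 1.0 / i : 0.0;  // index 0 is never used
__syncthreads();

// ...and in welford_update, replace the division with a lookup:
// wd.mean += delta * recip[wd.count];

With your launch configuration (numBlocks * blockSize equals n), each thread handles only one element before its grid-stride loop ends, so the per-thread counts stay tiny and even a much smaller table would do.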

Have you checked for bank conflicts and that no local memory is used?

The following transformation usually buys speed without negative impact on overall accuracy:

wd.mean = fma(1.0 / wd.count, delta, wd.mean);

On a processor architecture where all divisions are pure software implementations, reciprocals can be computed more cheaply than full divisions. While this adds an additional rounding error (one from the reciprocal, another from the multiplication), the FMA then eliminates a rounding error, so overall it tends to be a wash accuracy-wise.
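
Applied to the update function, the transformation would look something like this (a sketch; using fma on the M2 line as well is an additional, analogous application, not part of the suggestion above):

__device__ void welford_update_fma(WelfordData& wd, double x) {
    wd.count++;
    double delta = x - wd.mean;
    // reciprocal + FMA instead of a full division followed by an add
    wd.mean = fma(1.0 / wd.count, delta, wd.mean);
    wd.M2  = fma(delta, x - wd.mean, wd.M2);
}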

To be clear, this is a micro optimization. Use the profiler to guide your optimization efforts.

Thanks everybody very much! In the end, it seems to me that Welford’s algorithm is not easily parallelizable, even though its numerical stability is attractive. So I gave up and ended up with a standard sum/sum-of-squares reduction for the mean and variance calculations. It is an order of magnitude faster. Thank you once again!
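
For reference, a minimal sketch of that kind of sum/sum-of-squares reduction (illustrative, not the exact code; assumes blockDim.x is a multiple of 32, and atomicAdd on double needs sm_60 or newer):

__global__ void sum_sumsq_kernel(const float* data, int n, double* sum, double* sumsq) {
    double s = 0.0, s2 = 0.0;
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) {
        double x = data[i];
        s += x;
        s2 += x * x;
    }
    // Warp-level reduction of both accumulators.
    for (int offset = 16; offset > 0; offset >>= 1) {
        s  += __shfl_down_sync(0xFFFFFFFF, s, offset);
        s2 += __shfl_down_sync(0xFFFFFFFF, s2, offset);
    }
    if ((threadIdx.x & 31) == 0) {  // lane 0 of each warp
        atomicAdd(sum, s);
        atomicAdd(sumsq, s2);
    }
}

// Host side, after zero-initializing sum and sumsq:
//   mean = sum / n;  biased variance = sumsq / n - mean * mean;

Note that sumsq / n - mean * mean is exactly the cancellation-prone formula that Welford’s algorithm avoids, so accumulating in double matters here.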
