Hello,
Below is my implementation of Welford’s mean and variance calculation algorithm (leaving out error checking). It seems to work correctly, but it is terribly slow (>500 microseconds on my RTX 3070). Am I missing something? I’d really appreciate any corrections/hints/ideas. Thank you very much.
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
struct WelfordData {
double mean;
double M2;
long long count;
};
__device__ void welford_update(WelfordData& wd, double x) {
wd.count++;
double delta = x - wd.mean;
wd.mean += delta / wd.count;
wd.M2 += delta * (x - wd.mean);
}
__global__ void welford_kernel(const float* data, int n, WelfordData* block_results) {
extern __shared__ WelfordData shared_wd[];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int lane = threadIdx.x;
WelfordData local_wd = {0.0, 0.0, 0};
while (tid < n) {
welford_update(local_wd, static_cast<double>(data[tid]));
tid += blockDim.x * gridDim.x;
}
shared_wd[lane] = local_wd;
__syncthreads();
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (lane < s) {
WelfordData& a = shared_wd[lane];
WelfordData& b = shared_wd[lane + s];
if (b.count > 0) {
double delta = b.mean - a.mean;
long long combined_count = a.count + b.count;
a.M2 += b.M2 + delta * delta * a.count * b.count / combined_count;
a.mean += delta * b.count / combined_count;
a.count = combined_count;
}
}
__syncthreads();
}
if (lane == 0) {
block_results[blockIdx.x] = shared_wd[0];
}
}
__global__ void reduce_welford(WelfordData* block_results, int num_blocks, double* mean, double* var) {
__shared__ WelfordData shared_wd[256];
int tid = threadIdx.x;
WelfordData wd = {0.0, 0.0, 0};
for (int i = tid; i < num_blocks; i += blockDim.x) {
WelfordData b = block_results[i];
if (b.count == 0) continue;
double delta = b.mean - wd.mean;
long long combined_count = wd.count + b.count;
wd.M2 += b.M2 + delta * delta * wd.count * b.count / combined_count;
wd.mean += delta * b.count / combined_count;
wd.count = combined_count;
}
shared_wd[tid] = wd;
__syncthreads();
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
WelfordData& a = shared_wd[tid];
WelfordData& b = shared_wd[tid + s];
if (b.count > 0) {
double delta = b.mean - a.mean;
long long combined_count = a.count + b.count;
a.M2 += b.M2 + delta * delta * a.count * b.count / combined_count;
a.mean += delta * b.count / combined_count;
a.count = combined_count;
}
}
__syncthreads();
}
if (tid == 0) {
*mean = shared_wd[0].mean;
*var = (shared_wd[0].count > 1) ? shared_wd[0].M2 / shared_wd[0].count : 0;
// This computes biased variance (as opposed to the unbiased variance, which would divide by count - 1)
}
}
void compute_welford(const std::vector<float>& h_data, double& h_mean, double& h_var) {
int n = h_data.size();
if (n == 0) {
h_mean = 0;
h_var = 0;
return;
}
float *d_data;
double *d_mean, *d_var;
WelfordData *d_block_results;
cudaMalloc(&d_data, n * sizeof(float));
cudaMalloc(&d_mean, sizeof(double));
cudaMalloc(&d_var, sizeof(double));
int blockSize = 256; // must be a power of 2 for reduction to work correctly
int numBlocks = (n + blockSize - 1) / blockSize;
cudaMalloc(&d_block_results, numBlocks * sizeof(WelfordData));
cudaMemcpy(d_data, h_data.data(), n * sizeof(float), cudaMemcpyHostToDevice);
welford_kernel<<<numBlocks, blockSize, blockSize * sizeof(WelfordData)>>>(d_data, n, d_block_results);
reduce_welford<<<1, blockSize>>>(d_block_results, numBlocks, d_mean, d_var);
cudaMemcpy(&h_mean, d_mean, sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(&h_var, d_var, sizeof(double), cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaFree(d_mean);
cudaFree(d_var);
cudaFree(d_block_results);
}
int main() {
const int size = 1024 * 1280;
std::vector<float> data(size);
for (int i = 0; i < size; i++)
data[i] = static_cast<float>(i);
double mean, var;
compute_welford(data, mean, var);
std::cout << "Mean: " << mean << "\n";
std::cout << "Biased Variance: " << var << "\n";
return 0;
}