I have tried to remove unrelated items, but still a bit messy.
#include <iostream>
#include <fstream>
#include <cuda_runtime.h>
#define PROPAGATE_CUDA_ERROR(x) \
{ \
err = x; \
if (err != cudaSuccess) \
{ \
std::cerr << "CUDA Error [" << __FILE__ << ":" << __LINE__ << "]: " << cudaGetErrorString(err) << std::endl; \
return err; \
} \
}
cudaError_t precompute(
const uint *h_points,
unsigned long long len,
uint *&h_points_precompute,
cudaStream_t stream = 0
) {
cudaError_t err;
PROPAGATE_CUDA_ERROR(cudaHostAlloc(&h_points_precompute, 64 * len * 12, cudaHostAllocDefault));
cudaEvent_t begin_precompute;
PROPAGATE_CUDA_ERROR(cudaEventCreate(&begin_precompute));
PROPAGATE_CUDA_ERROR(cudaEventRecord(begin_precompute, stream));
cudaStream_t child_stream[2];
PROPAGATE_CUDA_ERROR(cudaStreamCreate(&child_stream[0]));
PROPAGATE_CUDA_ERROR(cudaStreamWaitEvent(child_stream[0], begin_precompute, cudaEventWaitDefault));
uint *points[2];
PROPAGATE_CUDA_ERROR(cudaMallocAsync(&points[0], 64 * len * 12, child_stream[0]));
uint stage = 0;
unsigned long long cur_len = std::min(len, len - 0);
PROPAGATE_CUDA_ERROR(cudaMemcpyAsync(points[stage], h_points, 64 * len, cudaMemcpyHostToDevice, child_stream[stage]));
PROPAGATE_CUDA_ERROR(cudaMemcpyAsync(h_points_precompute, points[stage], 64 * len * 12, cudaMemcpyDeviceToHost, child_stream[stage]));
stage ^= 1;
PROPAGATE_CUDA_ERROR(cudaFreeAsync(points[stage^1], child_stream[stage^1]));
cudaEvent_t end_precompute[2];
PROPAGATE_CUDA_ERROR(cudaEventCreate(&end_precompute[0]));
PROPAGATE_CUDA_ERROR(cudaEventRecord(end_precompute[0], child_stream[0]));
PROPAGATE_CUDA_ERROR(cudaStreamWaitEvent(stream, end_precompute[0], cudaEventWaitDefault));
return cudaSuccess;
}
__host__ cudaError_t run(
const unsigned long long len,
const uint *h_scalers,
const uint *h_points_precompute,
cudaStream_t stream = 0
) {
cudaError_t err;
uint *buckets_sum_buf;
PROPAGATE_CUDA_ERROR(cudaMallocAsync(&buckets_sum_buf, 128ull * 1 * ((1 << 21)), stream));
unsigned short *mutex_buf;
PROPAGATE_CUDA_ERROR(cudaMallocAsync(&mutex_buf, sizeof(unsigned short) * ((1 << 21)) * 1, stream));
PROPAGATE_CUDA_ERROR(cudaMemsetAsync(mutex_buf, 0, sizeof(unsigned short) * ((1 << 21)) * 1, stream));
unsigned short *initialized_buf;
PROPAGATE_CUDA_ERROR(cudaMallocAsync(&initialized_buf, sizeof(unsigned short) * ((1 << 21)) * 1, stream));
PROPAGATE_CUDA_ERROR(cudaMemsetAsync(initialized_buf, 0, sizeof(unsigned short) * ((1 << 21)) * 1, stream));
uint *cnt_zero;
PROPAGATE_CUDA_ERROR(cudaMallocAsync(&cnt_zero, sizeof(uint), stream));
unsigned long long *indexs;
PROPAGATE_CUDA_ERROR(cudaMallocAsync(&indexs, sizeof(unsigned long long) * 12 * len * 2, stream));
uint *scalers;
PROPAGATE_CUDA_ERROR(cudaMallocAsync(&scalers, sizeof(uint) * 8 * len, stream));
uint* points;
PROPAGATE_CUDA_ERROR(cudaMallocAsync(&points, 64 * len * 12, stream));
return cudaSuccess;
}
int main(int argc, char *argv[])
{
auto len = 1 << 24;
uint* scalers = new uint[len * 8];
uint* points = new uint[len * 64];
cudaHostRegister((void*)scalers, len * sizeof(uint) * 8, cudaHostRegisterDefault);
cudaHostRegister((void*)points, len * sizeof(uint) * 16, cudaHostRegisterDefault);
uint *h_points_precompute, head;
uint * h_points = (uint*)malloc(64 * len * sizeof(uint));
cudaStream_t stream;
cudaStreamCreate(&stream);
precompute((uint*)points, len, h_points_precompute, stream);
// cudaDeviceSynchronize();
run(len, (uint*)scalers, h_points_precompute, stream);
cudaDeviceSynchronize();
cudaHostUnregister((void*)scalers);
cudaHostUnregister((void*)points);
cudaFreeHost(h_points_precompute);
return 0;
}
This code is run on a RTX4090 with driver 550.107.02, cuda 12.3, with/without the cudaDeviceSynchronize()
in line 87will result in success/oom.