Hi,
I am familiar with GPU architecture, but I have barely programmed GPUs using CUDA.
I would like to profile the energy consumption and latency of single operations like addition, multiplication, and sine (both the standard and the intrinsic version).
For that I have kernels like:
__global__ void sin_intrinsic32_kernel(float* a, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = __sinf(a[idx]); // CUDA's intrinsic sine function
    }
}
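The arithmetic kernels referenced in main below are analogous; for example, the 32-bit integer addition kernel looks roughly like this (sketched from memory; the others differ only in element type and operator):

__global__ void add32_kernel(int* a, int* b, int* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx]; // one 32-bit integer addition per thread
    }
}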
Such kernels are run from a helper function like this:
#include <cuda_runtime.h>
#include <nvml.h>

// Helper to measure kernel execution time and energy consumption for 2-argument kernels
template <typename T>
void run_and_measure(void (*kernel)(T*, T*, int), T* a, T* c, int N, int num_blocks, nvmlDevice_t device,
                     float& avg_time_per_block_ns, float& avg_energy_per_kernel_mj) {
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    // Warm-up launch so the first timed run does not pay one-time costs
    kernel<<<blocksPerGrid, threadsPerBlock>>>(a, c, N);
    cudaDeviceSynchronize();
    // Create CUDA events to measure time
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    // Accumulators for time (ms) and energy (mJ, as NVML reports it)
    float total_time_ms = 0;
    unsigned long long total_energy_mj = 0;
    // Run the kernel 1K times and average
    const int runs = 1000;
    for (int i = 0; i < runs; ++i) {
        // Read the energy counter before the kernel
        unsigned long long energy_before = 0;
        nvmlDeviceGetTotalEnergyConsumption(device, &energy_before);
        // Launch the kernel and time it with events
        cudaEventRecord(start);
        kernel<<<blocksPerGrid, threadsPerBlock>>>(a, c, N);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop); // blocks until the kernel is done
        // Read the energy counter after the kernel
        unsigned long long energy_after = 0;
        nvmlDeviceGetTotalEnergyConsumption(device, &energy_after);
        total_energy_mj += (energy_after - energy_before);
        float milliseconds = 0;
        cudaEventElapsedTime(&milliseconds, start, stop);
        total_time_ms += milliseconds;
    }
    // Average time per kernel divided by the number of blocks (ms * 1e6 = ns)
    avg_time_per_block_ns = (total_time_ms / runs) * 1e6 / num_blocks;
    // Average energy per kernel launch; NVML reports millijoules, not joules
    avg_energy_per_kernel_mj = static_cast<float>(total_energy_mj) / runs;
    // Destroy CUDA events
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}
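The binary ops (add, mul, div) take three pointers, so I also have an analogous overload; it mirrors the function above (sketched here, since only the signature and the launches change):

// Overload for 3-argument kernels (a, b -> c); same measurement logic as above
template <typename T>
void run_and_measure(void (*kernel)(T*, T*, T*, int), T* a, T* b, T* c, int N, int num_blocks,
                     nvmlDevice_t device, float& avg_time_per_block_ns, float& avg_energy_per_kernel_mj) {
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    kernel<<<blocksPerGrid, threadsPerBlock>>>(a, b, c, N); // warm-up
    cudaDeviceSynchronize();
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float total_time_ms = 0;
    unsigned long long total_energy_mj = 0;
    const int runs = 1000;
    for (int i = 0; i < runs; ++i) {
        unsigned long long energy_before = 0, energy_after = 0;
        nvmlDeviceGetTotalEnergyConsumption(device, &energy_before);
        cudaEventRecord(start);
        kernel<<<blocksPerGrid, threadsPerBlock>>>(a, b, c, N);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        nvmlDeviceGetTotalEnergyConsumption(device, &energy_after);
        total_energy_mj += (energy_after - energy_before);
        float ms = 0;
        cudaEventElapsedTime(&ms, start, stop);
        total_time_ms += ms;
    }
    avg_time_per_block_ns = (total_time_ms / runs) * 1e6 / num_blocks;
    avg_energy_per_kernel_mj = static_cast<float>(total_energy_mj) / runs;
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}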
My main looks like this:
int main() {
    const int N = 1 << 20; // 1M elements
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    // Pointers for the different element types
    int *a32, *b32, *c32;
    short *a16, *b16, *c16;
    char *a8, *b8, *c8;
    float *af32, *bf32, *cf32;
    // Allocate unified memory (accessible by both CPU and GPU)
    cudaMallocManaged(&a32, N * sizeof(int));
    cudaMallocManaged(&b32, N * sizeof(int));
    cudaMallocManaged(&c32, N * sizeof(int));
    cudaMallocManaged(&a16, N * sizeof(short));
    cudaMallocManaged(&b16, N * sizeof(short));
    cudaMallocManaged(&c16, N * sizeof(short));
    cudaMallocManaged(&a8, N * sizeof(char));
    cudaMallocManaged(&b8, N * sizeof(char));
    cudaMallocManaged(&c8, N * sizeof(char));
    cudaMallocManaged(&af32, N * sizeof(float));
    cudaMallocManaged(&bf32, N * sizeof(float));
    cudaMallocManaged(&cf32, N * sizeof(float));
    // Initialize input arrays
    for (int i = 0; i < N; ++i) {
        a32[i] = i;
        b32[i] = i + 1;
        a16[i] = static_cast<short>(i);
        b16[i] = static_cast<short>(i + 1);
        a8[i] = static_cast<char>(i);
        b8[i] = static_cast<char>(i + 1);
        af32[i] = static_cast<float>(i);
        bf32[i] = static_cast<float>(i + 1);
    }
    // Energy measurement setup
    nvmlDevice_t device;
    start_energy_measurement(device);
    // Arrays to store time and energy results:
    // 14 measurements (add, mul, div over four types, plus two sine variants)
    float time_ns[14], energy_mj[14];
    // ------------- Measure and store results -------------
    // ADDITION
    run_and_measure(add32_kernel, a32, b32, c32, N, blocksPerGrid, device, time_ns[0], energy_mj[0]);
    ...
    return 0;
}
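For reference, start_energy_measurement is just a thin NVML setup wrapper, roughly like this (a sketch; the return codes should really be checked):

// Initialize NVML and fetch the handle for GPU 0
void start_energy_measurement(nvmlDevice_t& device) {
    nvmlInit();                             // bring up the NVML library
    nvmlDeviceGetHandleByIndex(0, &device); // handle for the first GPU
}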
However, I do not trust the results that I get:
Execution Time (ns):
Operation          32-bit     16-bit     8-bit      Float
Addition           9.16976    5.5709     3.89919    9.17013
Multiplication     9.20029    5.58998    3.89656    9.16062
Division           9.13221    6.33858    4.75525    9.07272
Sine (standard)    -          -          -          9.9
Sine (intrinsic)   -          -          -          6.51939

Energy Consumption (mJ):
Operation          32-bit     16-bit     8-bit      Float
Addition           9.9        6.51939    3.686      2.961
Multiplication     3.01       3.558      5.59       3.499
Division           3.766      3.949      2.613      2.888
Sine (standard)    -          -          -          4.154
Sine (intrinsic)   -          -          -          3.112
Can you please comment on my approach? Is this kind of per-kernel measurement even necessary to get energy and latency numbers?
If it is, what am I doing incorrectly such that addition shows a larger latency than sine in the time table?
Also, the energy figures for 32-bit and 16-bit addition look wrong to me: they are 2-3x larger than the rest.
Thank you for your comments in advance.