I want to verify the peak FP32 throughput of the CUDA cores on an RTX 3090; e.g. the throughput of 32-bit floating-point add is 128 per SM on sm_86 (CUDA C Programming Guide 12.3, chapter 8.4.1).

From the whitepaper we know the boost clock is 1.7 GHz, so I set the CUDA core clock to 1700 MHz:

nvidia-smi --lock-gpu-clocks=1700

The 3090's FP32 add throughput is therefore 128 (per SM) * 82 (number of SMs) * 1.7 GHz / 1000 = 17.8 TFLOPS. (An FMA counts as two operations, so multiplying by 2 gives 35.6 TFLOPS, the same as the 3090 spec.)

I wrote the program below to verify the CUDA core FP32 add throughput:

**global** void gpu_compute_peak(float* in, float* out, long long int *clock, unsigned int *loop_time)

{

int idx = threadIdx.x + blockIdx.x * blockDim.x;

if(idx < NUM_ELEMENTS) {

float input = in[idx];

unsigned int loop = loop_time[idx];

long long int start, end;

unsigned int i=0;

asm(â€ś.reg .u32 t1;\n\tâ€ť

â€ś.reg .f32 f1;\n\tâ€ť

â€ś.reg .f32 f2;\n\tâ€ť

â€śmov.u32 t1, %3;\n\tâ€ť

â€śmov.f32 f1, %4;\n\tâ€ť

â€śmov.u64 %1, %%clock64;\n\tâ€ť

â€śmov.f32 f2, 0f3F000000;\n\tâ€ť

â€ś$__LOOP:\n\tâ€ť

â€śadd.u32 t1, t1, 8;\n\tâ€ť

â€śadd.f32 f1, f1, 0f3F000000;\n\tâ€ť

â€śadd.f32 f1, f1, 0f3F800000;\n\tâ€ť

â€śadd.f32 f1, f1, 0f3F000000;\n\tâ€ť

â€śadd.f32 f1, f1, 0f3F800000;\n\tâ€ť

â€śadd.f32 f1, f1, 0f3F000000;\n\tâ€ť

â€śadd.f32 f1, f1, 0f3F800000;\n\tâ€ť

â€śadd.f32 f1, f1, 0f3F000000;\n\tâ€ť

â€śadd.f32 f1, f1, 0f3F800000;\n\tâ€ť

â€śsetp.ne.s32 %%p1, t1, %5;\n\tâ€ť

â€ś@%%p1 bra $__LOOP;\n\tâ€ť

â€śmov.u64 %2, %%clock64;\n\tâ€ť

â€śmov.f32 %0, f1;\n\tâ€ť

:â€ś=fâ€ť(input), â€ś=lâ€ť(start), â€ś=lâ€ť(end): â€śrâ€ť(i), â€śfâ€ť(input), â€śrâ€ť(loop)

);

clock[idx] = end - start;

out[idx] = input;

}

}

loop_time is always 0x10000, and NUM_ELEMENTS equals the number of threads, which is 10496.

The throughput is computed as: loop_time * number_of_threads / clock_delta * boost_clock / 1000

I get 4.2 TFLOPS, which is far less than 17.8. I don't know where I went wrong; can someone help me? The test code is attached.

Many thanks.

perf_test.txt (4.0 KB)