How to test FP64 (no tensor core) in A100

The value specified in the A100 specification for FP64 (without sensor core) is approximately 9.7 TFLOPS. How can we test to obtain this value?

Peak numbers generally cannot be obtained in a measurement.

To get as close as possible to that number, you would probably want to write a “synthetic” code that did as many back-to-back DFMA operations as possible, using registers as much as possible, not shared memory nor global memory.

This can be somewhat tricky to get the compiler to do what you want, when you have an objective like this in mind. Here’s an example that I came up with:

# cat t428.cu
#include <iostream>

const int cnt = 8;
//const int lps = 256;
__global__ void k(double *d, int lps){

  double a[cnt];
  double b[cnt];
  for (int i = 0; i < cnt; i++) {a[i] = d[i]; b[i] = d[i+1];}
  double sum = 0;
  for (int i = 0; i < lps; i++)
    for (int j = 0; j < cnt; j++)
      sum += a[j]*b[(i)&(cnt-1)];
  d[0] = sum;
}

int main(){
  double *d;
  cudaMalloc(&d, (cnt+1) * sizeof(double));
  k<<<58*3, 512>>>(d, 256);
  cudaDeviceSynchronize();
}

# nvcc -Xptxas=-v t428.cu -o t428 -arch=sm_89 -lineinfo
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z1kPdi' for 'sm_89'
ptxas info    : Function properties for _Z1kPdi
    64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 36 registers, used 0 barriers, 64 bytes cumulative stack size, 364 bytes cmem[0]
ptxas info    : Compile time = 15.155 ms
# cuobjdump -sass ./t428

Fatbin elf code:
================
arch = sm_89
code version = [1,8]
host = linux
compile_size = 64bit
identifier = t428.cu

        code for sm_89
        .target sm_89


Fatbin elf code:
================
arch = sm_89
code version = [1,8]
host = linux
compile_size = 64bit
identifier = t428.cu

        code for sm_89
        .target sm_89

                Function : _Z1kPdi
        .headerflags    @"EF_CUDA_SM89 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM89)"
        /*0000*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;    /* 0x00000a00ff017624 */
                                                                              /* 0x000fe400078e00ff */
        /*0010*/                   IMAD.MOV.U32 R20, RZ, RZ, c[0x0][0x160] ;  /* 0x00005800ff147624 */
                                                                              /* 0x000fe200078e00ff */
        /*0020*/                   ULDC.64 UR4, c[0x0][0x118] ;               /* 0x0000460000047ab9 */
                                                                              /* 0x000fe20000000a00 */
        /*0030*/                   IMAD.MOV.U32 R21, RZ, RZ, c[0x0][0x164] ;  /* 0x00005900ff157624 */
                                                                              /* 0x000fe200078e00ff */
        /*0040*/                   IADD3 R1, R1, -0x40, RZ ;                  /* 0xffffffc001017810 */
                                                                              /* 0x000fc80007ffe0ff */
        /*0050*/                   LDG.E.64 R4, [R20.64+0x38] ;               /* 0x0000380414047981 */
                                                                              /* 0x000ea8000c1e1b00 */
        /*0060*/                   LDG.E.64 R6, [R20.64+0x40] ;               /* 0x0000400414067981 */
                                                                              /* 0x000ea8000c1e1b00 */
        /*0070*/                   LDG.E.64 R8, [R20.64+0x8] ;                /* 0x0000080414087981 */
                                                                              /* 0x000ee8000c1e1b00 */
        /*0080*/                   LDG.E.64 R10, [R20.64+0x10] ;              /* 0x00001004140a7981 */
                                                                              /* 0x000ee8000c1e1b00 */
        /*0090*/                   LDG.E.64 R12, [R20.64+0x18] ;              /* 0x00001804140c7981 */
                                                                              /* 0x000f28000c1e1b00 */
        /*00a0*/                   LDG.E.64 R14, [R20.64+0x20] ;              /* 0x00002004140e7981 */
                                                                              /* 0x000f28000c1e1b00 */
        /*00b0*/                   LDG.E.64 R16, [R20.64+0x28] ;              /* 0x0000280414107981 */
                                                                              /* 0x000f68000c1e1b00 */
        /*00c0*/                   LDG.E.64 R18, [R20.64+0x30] ;              /* 0x0000300414127981 */
                                                                              /* 0x000f62000c1e1b00 */
        /*00d0*/                   IMAD.MOV.U32 R0, RZ, RZ, c[0x0][0x168] ;   /* 0x00005a00ff007624 */
                                                                              /* 0x000fe200078e00ff */
        /*00e0*/                   CS2R R2, SRZ ;                             /* 0x0000000000027805 */
                                                                              /* 0x000fc8000001ff00 */
        /*00f0*/                   ISETP.GE.AND P0, PT, R0, 0x1, PT ;         /* 0x000000010000780c */
                                                                              /* 0x000fe20003f06270 */
        /*0100*/                   STL.128 [R1+0x30], R4 ;                    /* 0x0000300401007387 */
                                                                              /* 0x0041e80000100c00 */
        /*0110*/                   STL.128 [R1], R8 ;                         /* 0x0000000801007387 */
                                                                              /* 0x0081e80000100c00 */
        /*0120*/                   STL.128 [R1+0x10], R12 ;                   /* 0x0000100c01007387 */
                                                                              /* 0x0101e80000100c00 */
        /*0130*/                   STL.128 [R1+0x20], R16 ;                   /* 0x0000201001007387 */
                                                                              /* 0x0201e20000100c00 */
        /*0140*/              @!P0 BRA 0x620 ;                                /* 0x000004d000008947 */
                                                                              /* 0x000fea0003800000 */
        /*0150*/                   IADD3 R2, R0.reuse, -0x1, RZ ;             /* 0xffffffff00027810 */
                                                                              /* 0x040fe20007ffe0ff */
        /*0160*/                   LDG.E.64 R6, [R20.64] ;                    /* 0x0000000414067981 */
                                                                              /* 0x001162000c1e1b00 */
        /*0170*/                   LOP3.LUT P0, RZ, R0, 0x3, RZ, 0xc0, !PT ;  /* 0x0000000300ff7812 */
                                                                              /* 0x000fe2000780c0ff */
        /*0180*/                   IMAD.MOV.U32 R22, RZ, RZ, RZ ;             /* 0x000000ffff167224 */
                                                                              /* 0x000fe200078e00ff */
        /*0190*/                   ISETP.GE.U32.AND P1, PT, R2, 0x3, PT ;     /* 0x000000030200780c */
                                                                              /* 0x000fc40003f26070 */
        /*01a0*/                   LOP3.LUT R23, R0, 0x3, RZ, 0xc0, !PT ;     /* 0x0000000300177812 */
                                                                              /* 0x000fe200078ec0ff */
        /*01b0*/                   CS2R R2, SRZ ;                             /* 0x0000000000027805 */
                                                                              /* 0x000fd4000001ff00 */
        /*01c0*/              @!P1 BRA 0x510 ;                                /* 0x0000034000009947 */
                                                                              /* 0x000fea0003800000 */
        /*01d0*/                   IADD3 R32, -R23, c[0x0][0x168], RZ ;       /* 0x00005a0017207a10 */
                                                                              /* 0x001fe20007ffe1ff */
        /*01e0*/                   IMAD.MOV.U32 R22, RZ, RZ, RZ ;             /* 0x000000ffff167224 */
                                                                              /* 0x000fe200078e00ff */
        /*01f0*/                   CS2R R2, SRZ ;                             /* 0x0000000000027805 */
                                                                              /* 0x000fc6000001ff00 */
        /*0200*/                   IMAD.SHL.U32 R28, R22, 0x8, RZ ;           /* 0x00000008161c7824 */
                                                                              /* 0x001fca00078e00ff */
        /*0210*/                   LOP3.LUT R0, R28, 0x20, RZ, 0xc0, !PT ;    /* 0x000000201c007812 */
                                                                              /* 0x000fca00078ec0ff */
        /*0220*/                   IMAD.IADD R24, R1, 0x1, R0 ;               /* 0x0000000101187824 */
                                                                              /* 0x000fcc00078e0200 */
        /*0230*/                   LDL.64 R24, [R24] ;                        /* 0x0000000018187983 */
                                                                              /* 0x000ea20000100a00 */
        /*0240*/                   IADD3 R0, R28, 0x8, RZ ;                   /* 0x000000081c007810 */
                                                                              /* 0x000fc80007ffe0ff */
        /*0250*/                   LOP3.LUT R0, R0, 0x28, RZ, 0xc0, !PT ;     /* 0x0000002800007812 */
                                                                              /* 0x000fca00078ec0ff */
        /*0260*/                   IMAD.IADD R33, R1, 0x1, R0 ;               /* 0x0000000101217824 */
                                                                              /* 0x000fca00078e0200 */
        /*0270*/                   LDL.64 R26, [R33] ;                        /* 0x00000000211a7983 */
                                                                              /* 0x000ee20000100a00 */
        /*0280*/                   IADD3 R0, R28, 0x10, RZ ;                  /* 0x000000101c007810 */
                                                                              /* 0x000fc60007ffe0ff */
        /*0290*/                   LDL.64 R28, [R33+0x10] ;                   /* 0x00001000211c7983 */
                                                                              /* 0x000f220000100a00 */
        /*02a0*/                   LOP3.LUT R0, R0, 0x30, RZ, 0xc0, !PT ;     /* 0x0000003000007812 */
                                                                              /* 0x000fca00078ec0ff */
        /*02b0*/                   IMAD.IADD R0, R1, 0x1, R0 ;                /* 0x0000000101007824 */
                                                                              /* 0x000fca00078e0200 */
        /*02c0*/                   LDL.64 R30, [R0] ;                         /* 0x00000000001e7983 */
                                                                              /* 0x000ee20000100a00 */
        /*02d0*/                   IADD3 R32, R32, -0x4, RZ ;                 /* 0xfffffffc20207810 */
                                                                              /* 0x000fc80007ffe0ff */
        /*02e0*/                   ISETP.NE.AND P1, PT, R32, RZ, PT ;         /* 0x000000ff2000720c */
                                                                              /* 0x000fe40003f25270 */
        /*02f0*/                   IADD3 R22, R22, 0x4, RZ ;                  /* 0x0000000416167810 */
                                                                              /* 0x000fe20007ffe0ff */
        /*0300*/                   DFMA R2, R6, R24, R2 ;                     /* 0x000000180602722b */
                                                                              /* 0x026e0c0000000002 */
        /*0310*/                   DFMA R2, R8, R24, R2 ;                     /* 0x000000180802722b */
                                                                              /* 0x001e0c0000000002 */
        /*0320*/                   DFMA R2, R10, R24, R2 ;                    /* 0x000000180a02722b */
                                                                              /* 0x001e0c0000000002 */
        /*0330*/                   DFMA R2, R12, R24, R2 ;                    /* 0x000000180c02722b */
                                                                              /* 0x001e0c0000000002 */
        /*0340*/                   DFMA R2, R14, R24, R2 ;                    /* 0x000000180e02722b */
                                                                              /* 0x001e0c0000000002 */
        /*0350*/                   DFMA R2, R16, R24, R2 ;                    /* 0x000000181002722b */
                                                                              /* 0x001e0c0000000002 */
        /*0360*/                   DFMA R2, R18, R24, R2 ;                    /* 0x000000181202722b */
                                                                              /* 0x001e0c0000000002 */
        /*0370*/                   DFMA R2, R4, R24, R2 ;                     /* 0x000000180402722b */
                                                                              /* 0x001ecc0000000002 */
        /*0380*/                   DFMA R2, R6, R26, R2 ;                     /* 0x0000001a0602722b */
                                                                              /* 0x008e0c0000000002 */
        /*0390*/                   DFMA R2, R8, R26, R2 ;                     /* 0x0000001a0802722b */
                                                                              /* 0x001e0c0000000002 */
        /*03a0*/                   DFMA R2, R10, R26, R2 ;                    /* 0x0000001a0a02722b */
                                                                              /* 0x001e0c0000000002 */
        /*03b0*/                   DFMA R2, R12, R26, R2 ;                    /* 0x0000001a0c02722b */
                                                                              /* 0x001e0c0000000002 */
        /*03c0*/                   DFMA R2, R14, R26, R2 ;                    /* 0x0000001a0e02722b */
                                                                              /* 0x001e0c0000000002 */
        /*03d0*/                   DFMA R2, R16, R26, R2 ;                    /* 0x0000001a1002722b */
                                                                              /* 0x001e0c0000000002 */
        /*03e0*/                   DFMA R2, R18, R26, R2 ;                    /* 0x0000001a1202722b */
                                                                              /* 0x001e0c0000000002 */
        /*03f0*/                   DFMA R2, R4, R26, R2 ;                     /* 0x0000001a0402722b */
                                                                              /* 0x001e0c0000000002 */
        /*0400*/                   DFMA R2, R6, R30, R2 ;                     /* 0x0000001e0602722b */
                                                                              /* 0x001e0c0000000002 */
        /*0410*/                   DFMA R2, R8, R30, R2 ;                     /* 0x0000001e0802722b */
                                                                              /* 0x001e0c0000000002 */
        /*0420*/                   DFMA R2, R10, R30, R2 ;                    /* 0x0000001e0a02722b */
                                                                              /* 0x001e0c0000000002 */
        /*0430*/                   DFMA R2, R12, R30, R2 ;                    /* 0x0000001e0c02722b */
                                                                              /* 0x001e0c0000000002 */
        /*0440*/                   DFMA R2, R14, R30, R2 ;                    /* 0x0000001e0e02722b */
                                                                              /* 0x001e0c0000000002 */
        /*0450*/                   DFMA R2, R16, R30, R2 ;                    /* 0x0000001e1002722b */
                                                                              /* 0x001e0c0000000002 */
        /*0460*/                   DFMA R2, R18, R30, R2 ;                    /* 0x0000001e1202722b */
                                                                              /* 0x001e0c0000000002 */
        /*0470*/                   DFMA R2, R4, R30, R2 ;                     /* 0x0000001e0402722b */
                                                                              /* 0x001f0c0000000002 */
        /*0480*/                   DFMA R2, R6, R28, R2 ;                     /* 0x0000001c0602722b */
                                                                              /* 0x010e0c0000000002 */
        /*0490*/                   DFMA R2, R8, R28, R2 ;                     /* 0x0000001c0802722b */
                                                                              /* 0x001e0c0000000002 */
        /*04a0*/                   DFMA R2, R10, R28, R2 ;                    /* 0x0000001c0a02722b */
                                                                              /* 0x001e0c0000000002 */
        /*04b0*/                   DFMA R2, R12, R28, R2 ;                    /* 0x0000001c0c02722b */
                                                                              /* 0x001e0c0000000002 */
        /*04c0*/                   DFMA R2, R14, R28, R2 ;                    /* 0x0000001c0e02722b */
                                                                              /* 0x001e0c0000000002 */
        /*04d0*/                   DFMA R2, R16, R28, R2 ;                    /* 0x0000001c1002722b */
                                                                              /* 0x001e0c0000000002 */
        /*04e0*/                   DFMA R2, R18, R28, R2 ;                    /* 0x0000001c1202722b */
                                                                              /* 0x001e0c0000000002 */
        /*04f0*/                   DFMA R2, R4, R28, R2 ;                     /* 0x0000001c0402722b */
                                                                              /* 0x0010620000000002 */
        /*0500*/               @P1 BRA 0x200 ;                                /* 0xfffffcf000001947 */
                                                                              /* 0x000fea000383ffff */
        /*0510*/              @!P0 BRA 0x620 ;                                /* 0x0000010000008947 */
                                                                              /* 0x001fea0003800000 */
        /*0520*/                   IMAD.SHL.U32 R0, R22, 0x8, RZ ;            /* 0x0000000816007824 */
                                                                              /* 0x000fca00078e00ff */
        /*0530*/                   LOP3.LUT R0, R0, 0x38, RZ, 0xc0, !PT ;     /* 0x0000003800007812 */
                                                                              /* 0x000fca00078ec0ff */
        /*0540*/                   IMAD.IADD R0, R1, 0x1, R0 ;                /* 0x0000000101007824 */
                                                                              /* 0x000fca00078e0200 */
        /*0550*/                   LDL.64 R24, [R0] ;                         /* 0x0000000000187983 */
                                                                              /* 0x001ea20000100a00 */
        /*0560*/                   IADD3 R23, R23, -0x1, RZ ;                 /* 0xffffffff17177810 */
                                                                              /* 0x000fe40007ffe0ff */
        /*0570*/                   IADD3 R22, R22, 0x1, RZ ;                  /* 0x0000000116167810 */
                                                                              /* 0x000fe40007ffe0ff */
        /*0580*/                   ISETP.NE.AND P0, PT, R23, RZ, PT ;         /* 0x000000ff1700720c */
                                                                              /* 0x000fe20003f05270 */
        /*0590*/                   DFMA R2, R6, R24, R2 ;                     /* 0x000000180602722b */
                                                                              /* 0x026e0c0000000002 */
        /*05a0*/                   DFMA R2, R8, R24, R2 ;                     /* 0x000000180802722b */
                                                                              /* 0x001e0c0000000002 */
        /*05b0*/                   DFMA R2, R10, R24, R2 ;                    /* 0x000000180a02722b */
                                                                              /* 0x001e0c0000000002 */
        /*05c0*/                   DFMA R2, R12, R24, R2 ;                    /* 0x000000180c02722b */
                                                                              /* 0x001e0c0000000002 */
        /*05d0*/                   DFMA R2, R14, R24, R2 ;                    /* 0x000000180e02722b */
                                                                              /* 0x001e0c0000000002 */
        /*05e0*/                   DFMA R2, R16, R24, R2 ;                    /* 0x000000181002722b */
                                                                              /* 0x001e0c0000000002 */
        /*05f0*/                   DFMA R2, R18, R24, R2 ;                    /* 0x000000181202722b */
                                                                              /* 0x001e0c0000000002 */
        /*0600*/                   DFMA R2, R4, R24, R2 ;                     /* 0x000000180402722b */
                                                                              /* 0x0010620000000002 */
        /*0610*/               @P0 BRA 0x520 ;                                /* 0xffffff0000000947 */
                                                                              /* 0x000fea000383ffff */
        /*0620*/                   NOP ;                                      /* 0x0000000000007918 */
                                                                              /* 0x000fc20000000000 */
        /*0630*/                   STG.E.64 [R20.64], R2 ;                    /* 0x0000000214007986 */
                                                                              /* 0x002fe2000c101b04 */
        /*0640*/                   EXIT ;                                     /* 0x000000000000794d */
                                                                              /* 0x000fea0003800000 */
        /*0650*/                   BRA 0x650;                                 /* 0xfffffff000007947 */
                                                                              /* 0x000fc0000383ffff */
        /*0660*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0670*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0680*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0690*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*06a0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*06b0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*06c0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*06d0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*06e0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*06f0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
                ..........



Fatbin ptx code:
================
arch = sm_89
code version = [9,0]
host = linux
compile_size = 64bit
compressed
identifier = t428.cu
ptxasOptions = -v  --generate-line-info
#

That has about 32 DFMA in a row, inside a loop. Unfortunately there’s roughly 16 other instructions in the loop body, so this isn’t going to be ideal.

But fiddling with that code may give you some ideas.

The classical method is to invoke CUBLAS DGEMM on large square matrices, using the most favorable combination of transposition modes, which is often N,T (you might want to try all four combinations to make sure). I don’t have an A100 at my disposal to tell you how well this approach works on that platform. Realistically, observed peak performance will likely top out at around 85% of theoretical performance.

A100 has an FP64 tensor core path. I think CUBLAS would aggressively try to use it for DGEMM. The question seems to be asking how to avoid use of the tensor core path.

On my L4 GPU (not a very good test case, for a few reasons) this code reports 460GF/s vs. a 473GF/s specification for FP64:

# cat t428.cu
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start=0){

    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

const int cnt = 8;
__global__ void k(double *d, int lps){

  double a[cnt];
  double b[cnt];
  for (int i = 0; i < cnt; i++) {a[i] = d[i]; b[i] = d[i+1];}
  double sum = 0;
  for (int i = 0; i < lps; i++)
    for (int j = 0; j < cnt; j++)
      sum += a[j]*b[(i)&(cnt-1)];
  d[0] = sum;
}

int main(){
  const int blocks = 58*3;
  const int threads = 512;
  const int loops = 256;
  double *d;
  cudaMalloc(&d, (cnt+1) * sizeof(double));
  k<<<blocks, threads>>>(d, 2);
  cudaDeviceSynchronize();
  unsigned long long dt = dtime_usec(0);
  k<<<blocks, threads>>>(d, loops);
  cudaDeviceSynchronize();
  dt = dtime_usec(dt);
  unsigned long long num_ops = (unsigned long long)blocks*threads*loops*cnt*2;
  float mflops = num_ops/(float)dt;
  std::cout << mflops/1000 << "GF/s" << std::endl;
}

# nvcc -Xptxas=-v t428.cu -o t428 -arch=sm_89 -lineinfo
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z1kPdi' for 'sm_89'
ptxas info    : Function properties for _Z1kPdi
    64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 36 registers, used 0 barriers, 64 bytes cumulative stack size, 364 bytes cmem[0]
ptxas info    : Compile time = 15.392 ms
# ./t428
460.157GF/s
#

I stand corrected. I mistakenly thought tensor cores are only used for mixed-precision computation.

I guess anything that looks like a dot product could be transformed to use tensor cores, so maybe a microbenchmark that evaluates polynomials in parallel using the Horner scheme might be suitable, as long as the coefficients can be coded as literal constants that are incorporated into each DFMA instruction (generally this requires the least significant 32 bits of the FP64 operand to be zero). Here is a program that uses this approach.

Adjust LEN (number of polynomials to be evaluated). The number currently dialed in was chosen to keep kernel execution time under 2 seconds on my Quadro RTX 4000, which produces this output:

CUDA initialization: 0.123 seconds
threadblocks=786432 threads/block=128
mintime = 1934.291 msec  FP64 GFLOPS=    2.13161731e+02
#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#define THREADS   (128)
#define LEN       (1024*1024*96)
#define ITERS     (3)

constexpr int N = 2048;

__device__ double horner (double x)
{
    double p = 1.0;
#pragma unroll N
    for (int i = 0; i < N; i++) {
        p = fma (p, x, 1.0 + i / 32768.0);
    }
    return p;
}

__global__ void fp64_throughput (const double * __restrict__ src, 
                                 double * __restrict__ dst, int len)
{
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
        dst[i] = horner (src[i]);
    }
}
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

#if defined(_WIN32)
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
double second (void)
{
    LARGE_INTEGER t;
    static double oofreq;
    static int checkedForHighResTimer;
    static BOOL hasHighResTimer;

    if (!checkedForHighResTimer) {
        hasHighResTimer = QueryPerformanceFrequency (&t);
        oofreq = 1.0 / (double)t.QuadPart;
        checkedForHighResTimer = 1;
    }
    if (hasHighResTimer) {
        QueryPerformanceCounter (&t);
        return (double)t.QuadPart * oofreq;
    } else {
        return (double)GetTickCount() * 1.0e-3;
    }
}
#elif defined(__linux__) || defined(__APPLE__)
#include <stddef.h>
#include <sys/time.h>
double second (void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1.0e-6;
}
#else
#error unsupported platform
#endif

int main (void)
{
    double start, stop, init_start, init_stop, elapsed, mintime;
    double *d_a, *d_b;
    
    /* Trigger CUDA context creation */
    init_start = second();
    CUDA_SAFE_CALL (cudaFree (0));
    init_stop = second();
    printf ("CUDA initialization: %.3f seconds\n", init_stop - init_start);

    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * LEN));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * LEN));
    
    /* Initialize device memory */
    CUDA_SAFE_CALL (cudaMemset(d_a, 0x00, sizeof(d_a[0]) * LEN)); // zero
    CUDA_SAFE_CALL (cudaMemset(d_b, 0xff, sizeof(d_b[0]) * LEN)); // NaN

    /* Compute execution configuration */
    dim3 dimBlock(THREADS);
    int threadBlocks = (LEN + (dimBlock.x - 1)) / dimBlock.x;
    dim3 dimGrid(threadBlocks);
    printf ("threadblocks=%d threads/block=%d\n", threadBlocks, THREADS);
    mintime = fabs(log(0.0));
    for (int k = 0; k < ITERS; k++) {
        start = second();
        fp64_throughput<<<dimGrid,dimBlock>>>(d_a, d_b, LEN);
        CHECK_LAUNCH_ERROR();
        stop = second();
        elapsed = stop - start;
        if (elapsed < mintime) mintime = elapsed;
    }
    printf ("mintime = %.3f msec  FP64 GFLOPS=%18.8e\n", 1.0e3 * mintime,
            2.0 * LEN * N / mintime / 1e9);

    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));

    return EXIT_SUCCESS;
}
1 Like

Thanks,The value of this test is now very close to the specification indicated in the document.