Unable to reach full throughput on Titan X Pascal for __dp4a

Hi,
I am trying out dp4a on Titan X Pascal, for fun I wrote a benchmark to check the limits of hardware. Unfortunately, I couldn’t get 44 TOPS on it.

Source: https://blogs.nvidia.com/blog/2016/07/21/titan-x/

The number I am getting is 0.5TOPS. (1/100th of peak).

Here is the source code:

#include<cuda.h>
#include<cuda_runtime_api.h>
#include<iostream>
#include<assert.h>

#define LEN 1024*1024*128
// 128M elements
#define SIZE LEN*4
// 512MB

#define ITER 0

__global__ void DoDP4A(unsigned int *In1, unsigned int *In2, unsigned int *In3, unsigned int *Out) {
    unsigned tid = threadIdx.x + blockIdx.x * blockDim.x;
    Out[tid] = __dp4a(In1[tid], In2[tid], In3[tid]);
    for(unsigned i=0;i<ITER; i++){
        Out[tid] = __dp4a(In1[tid], In2[tid], Out[tid]);
    }
}

#define val 0x01010101

int main() {
    unsigned *In1 = new unsigned[LEN];
    unsigned *In2 = new unsigned[LEN];
    unsigned *In3 = new unsigned[LEN];
    unsigned *Out = new unsigned[LEN];
    for(unsigned i=0;i<LEN;i++) {
        In1[i] = val;
        In2[i] = val;
        In3[i] = 0;
        Out[i] = 0;
    }
    cudaSetDevice(1);
    unsigned *In1d, *In2d, *In3d, *Outd;
    cudaMalloc((void**)&In1d, SIZE);
    cudaMalloc((void**)&In2d, SIZE);
    cudaMalloc((void**)&In3d, SIZE);
    cudaMalloc((void**)&Outd, SIZE);
    cudaMemcpy(In1d, In1, SIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(In2d, In2, SIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(In3d, In3, SIZE, cudaMemcpyHostToDevice);
    struct timespec start;
    struct timespec stop;
    clock_gettime(CLOCK_MONOTONIC, &start);
    for(unsigned i=0;i<1024;i++){ 
    DoDP4A<<<dim3(LEN/1024,1,1), dim3(1024,1,1)>>>(In1d, In2d, In3d, Outd);
    }
    cudaDeviceSynchronize();
    
    clock_gettime(CLOCK_MONOTONIC, &stop);
    std::cout<<stop.tv_sec-start.tv_sec<<" for "<<(SIZE/(1024*1024))*8*1024<<" Mops"<<std::endl;
    cudaMemcpy(Out, Outd, SIZE, cudaMemcpyDeviceToHost);
    for(unsigned i=0;i<LEN;i++) {
        assert(Out[i] == 4*(ITER+1));
    }
}

Titan X is not default device on my system. Make sure to change it for testing.

Your code seems to be entirely memory bound, rather than compute bound.

45 Tops on Pascal titan X … assuming I didn’t make any mistakes …

$ cat t42.cu
#include<iostream>
#include <stdio.h>

#define ITER 1024
#define SSZ 512
#define BSZ 512


#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

__global__ void DoDP4A() {
    __shared__ int data1[SSZ];
    __shared__ int data2[SSZ];
    __shared__ int data3[SSZ];
    int out;
    for (int i = 0; i < ITER; i++)
      out += __dp4a(data1[threadIdx.x], data2[threadIdx.x], data3[threadIdx.x]);
    if (!out) printf("out = %d\n", out);
}


int main() {
    DoDP4A<<<BSZ, SSZ>>>();  // warm up
    unsigned long long dt = dtime_usec(0);
    DoDP4A<<<BSZ*ITER, SSZ>>>();
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    unsigned long long ops = ITER;
    ops *= ITER;
    ops *= SSZ;
    ops *= BSZ;
    float et = dt/(float)USECPSEC;
    unsigned long long Mops = ops/1000000;
    std::cout<<et<<"s for "<< Mops << " Mdp4a"<<std::endl;
    float tp = (Mops*8)/(et*1000000);
    std::cout << "throughput: " << tp << " Tops/s" << std::endl;
}
$ nvcc -arch=sm_61 -o t42 t42.cu
$ cuda-memcheck ./t42
========= CUDA-MEMCHECK
0.113676s for 274877 Mdp4a
throughput: 19.3446 Tops/s
========= ERROR SUMMARY: 0 errors
$ ./t42
0.048664s for 274877 Mdp4a
throughput: 45.1877 Tops/s
$

Hi,
Thank you for the code. It’s beautiful.
Why use shared memory? To get better throughput, we can use all the registers (256 or 200 something like that.) ??

no particular reason. shared memory here is pretty much irrelevant. the compiler will optimize it into a register anyway (after the first shared memory load).

I didn’t want to have to deal with any loads from global memory, nor did I want to use any compile-time constants. I’m sure there are other ways to skin the cat.

PETA won’t approve that.

I’m sure there are other ways to skin a kumquat.

Fixed.

Hi,
I was able to achieve 44TOPs using the following code.

BSZ is 7 * 4 because 7512 is total number of cuda cores on Titan X Pascal. 4 adds to ITER. (ITER4). Making it to 8,16 staturated the score at 46.

#include<iostream>
#include <stdio.h>

#define ITER 1024*1024*1024
#define SSZ 512
#define BSZ 7*16


#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

__global__ void DoDP4A(int *in1d, int *in2d, int* in3d, int* outd) {
    int tx = threadIdx.x;
    int in1 = in1d[tx];
    int in2 = in2d[tx];
    int in3 = in3d[tx];
    int out;
    for (int i = 0; i < ITER; i++) {
      out += __dp4a(in1, in2, in3);
    }
    outd[tx] = out;
}


int main() {
    cudaSetDevice(2);
    int *in1d, *in2d, *in3d, *outd;
    cudaMalloc((void**)&in1d, SSZ*4);
    cudaMalloc((void**)&in2d, SSZ*4);
    cudaMalloc((void**)&in3d, SSZ*4);
    cudaMalloc((void**)&outd, SSZ*4);
    DoDP4A<<<1, SSZ>>>(in1d, in2d, in3d, outd);  // warm up
    cudaDeviceSynchronize();
    unsigned long long dt = dtime_usec(0);
    DoDP4A<<<BSZ, SSZ>>>(in1d, in2d, in3d, outd);
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    unsigned long long ops = ITER;
    ops *= BSZ;
    ops *= SSZ;
    float et = dt/(float)USECPSEC;
    unsigned long long Mops = ops/1000000;
    std::cout<<et<<"s for "<< Mops << " Mdp4a"<<std::endl;
    float tp = (Mops*8)/(et*1000000);
    std::cout << "throughput: " << tp << " Tops/s" << std::endl;
}

Hello,
I want to use __dp4a to multiply 2 numbers using 4 8-bit multiiiplications and want to compare it without dp4a.
How can I do it ?

Thanks