Hi,
I am trying out dp4a on a Titan X Pascal. For fun, I wrote a benchmark to check the limits of the hardware. Unfortunately, I couldn't get the advertised 44 TOPS out of it.
Source: GPU Archives | NVIDIA Blog
The number I am getting is 0.5 TOPS (about 1/100th of peak).
Here is the source code:
#include<cuda.h>
#include<cuda_runtime_api.h>
#include<iostream>
#include<assert.h>
// Number of 32-bit elements in each buffer: 128 Mi elements.
// Parenthesized so the macro expands as one value inside larger expressions
// (e.g. `x / LEN` or `x % LEN`), instead of binding to the first factor only.
#define LEN (1024*1024*128)
// Size of each buffer in bytes: 4 bytes per element, i.e. 512 MiB.
#define SIZE (LEN*4)
// Number of additional __dp4a passes the kernel performs after the first one
// (0 means exactly one __dp4a per element).
#define ITER 0
// Benchmark kernel: every thread handles one 32-bit element and applies
// __dp4a to it ITER + 1 times, i.e. Out[tid] ends up as
// dp4a(In1[tid], In2[tid], ... dp4a(In1[tid], In2[tid], In3[tid]) ...).
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element; threads whose
// index is at or beyond LEN do nothing.
// Preconditions: all four pointers refer to device buffers of at least LEN
// elements, and Out must not alias In1 or In2 (the operands are read once and
// kept in registers).
// Requires a device that supports __dp4a (compute capability 6.1 or newer
// according to the CUDA documentation).
__global__ void DoDP4A(unsigned int *In1, unsigned int *In2, unsigned int *In3, unsigned int *Out) {
    const unsigned tid = threadIdx.x + blockIdx.x * blockDim.x;
    // Tail guard so an over-provisioned grid cannot touch memory past the buffers.
    if (tid >= static_cast<unsigned>(LEN)) {
        return;
    }

    // Load each operand from global memory exactly once. The original re-read
    // In1/In2/Out and re-wrote Out on every pass, which makes the loop measure
    // memory traffic instead of dp4a throughput.
    const unsigned a = In1[tid];
    const unsigned b = In2[tid];
    unsigned acc = __dp4a(a, b, In3[tid]);
    for (unsigned i = 0; i < ITER; i++) {
        acc = __dp4a(a, b, acc);
    }

    // Single store of the final accumulator.
    Out[tid] = acc;
}
// Test operand: every 8-bit lane is 1, so one __dp4a of val with val adds
// 1*1 + 1*1 + 1*1 + 1*1 = 4 to the accumulator.
#define val 0x01010101

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess and false otherwise, so callers
// can write `if (!cudaOk(call, "what")) return 1;`.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Benchmark driver: fills the operand buffers, launches DoDP4A repeatedly,
// times the launches with CUDA events, prints the achieved rate and verifies
// the result. Returns 0 on success and 1 on any CUDA error or result mismatch
// (on the error paths the process exits without releasing its buffers).
int main() {
    // Copy the macros into typed constants once so later arithmetic does not
    // depend on how the macros are parenthesized.
    const size_t numElems = LEN;
    const size_t numBytes = numElems * sizeof(unsigned);
    const unsigned threadsPerBlock = 1024;
    const unsigned numLaunches = 1024;

    // The grid below is numElems / threadsPerBlock blocks, so the element
    // count has to be an exact multiple of the block size.
    if (numElems % threadsPerBlock != 0) {
        std::cerr << "LEN must be a multiple of " << threadsPerBlock << std::endl;
        return 1;
    }

    // Host-side operands and result buffer. In3 is all zeros, so it is
    // produced on the device with cudaMemset instead of a host copy.
    unsigned *In1 = new unsigned[numElems];
    unsigned *In2 = new unsigned[numElems];
    unsigned *Out = new unsigned[numElems];
    for (size_t i = 0; i < numElems; i++) {
        In1[i] = val;
        In2[i] = val;
    }

    // Device 1 is hard-coded here; change the index to match the GPU to test.
    if (!cudaOk(cudaSetDevice(1), "cudaSetDevice")) return 1;

    unsigned *In1d = 0, *In2d = 0, *In3d = 0, *Outd = 0;
    if (!cudaOk(cudaMalloc((void**)&In1d, numBytes), "cudaMalloc(In1d)")) return 1;
    if (!cudaOk(cudaMalloc((void**)&In2d, numBytes), "cudaMalloc(In2d)")) return 1;
    if (!cudaOk(cudaMalloc((void**)&In3d, numBytes), "cudaMalloc(In3d)")) return 1;
    if (!cudaOk(cudaMalloc((void**)&Outd, numBytes), "cudaMalloc(Outd)")) return 1;

    if (!cudaOk(cudaMemcpy(In1d, In1, numBytes, cudaMemcpyHostToDevice), "cudaMemcpy(In1d)")) return 1;
    if (!cudaOk(cudaMemcpy(In2d, In2, numBytes, cudaMemcpyHostToDevice), "cudaMemcpy(In2d)")) return 1;
    if (!cudaOk(cudaMemset(In3d, 0, numBytes), "cudaMemset(In3d)")) return 1;

    // CUDA events give sub-millisecond device-side timing; the original kept
    // only the whole-second part of a host clock.
    cudaEvent_t startEvt, stopEvt;
    if (!cudaOk(cudaEventCreate(&startEvt), "cudaEventCreate(start)")) return 1;
    if (!cudaOk(cudaEventCreate(&stopEvt), "cudaEventCreate(stop)")) return 1;

    const dim3 grid(static_cast<unsigned>(numElems / threadsPerBlock), 1, 1);
    const dim3 block(threadsPerBlock, 1, 1);

    if (!cudaOk(cudaEventRecord(startEvt, 0), "cudaEventRecord(start)")) return 1;
    for (unsigned launch = 0; launch < numLaunches; launch++) {
        DoDP4A<<<grid, block>>>(In1d, In2d, In3d, Outd);
    }
    // Launch-configuration errors are only visible through cudaGetLastError.
    if (!cudaOk(cudaGetLastError(), "DoDP4A launch")) return 1;
    if (!cudaOk(cudaEventRecord(stopEvt, 0), "cudaEventRecord(stop)")) return 1;
    // Waiting on the stop event also surfaces faults raised while the kernels ran.
    if (!cudaOk(cudaEventSynchronize(stopEvt), "cudaEventSynchronize(stop)")) return 1;

    float elapsedMs = 0.0f;
    if (!cudaOk(cudaEventElapsedTime(&elapsedMs, startEvt, stopEvt), "cudaEventElapsedTime")) return 1;
    const double seconds = elapsedMs / 1000.0;

    // One __dp4a is 4 multiplies + 4 adds = 8 operations per 32-bit ELEMENT.
    // The original multiplied by the byte count (SIZE), overstating the work
    // by 4x, and ignored the ITER extra passes.
    const size_t passesPerElem = static_cast<size_t>(ITER) + 1;
    const size_t totalOps = numElems * 8 * numLaunches * passesPerElem;
    std::cout<<seconds<<" for "<<totalOps / (1024 * 1024)<<" Mops"<<std::endl;
    if (seconds > 0.0) {
        std::cout<<(static_cast<double>(totalOps) / seconds) / 1e12<<" TOPS"<<std::endl;
    }

    if (!cudaOk(cudaMemcpy(Out, Outd, numBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(Out)")) return 1;

    // Every launch restarts from In3 (all zeros), so each element must equal
    // 4 per __dp4a pass. Checked explicitly because assert() vanishes under NDEBUG.
    const unsigned expected = 4 * static_cast<unsigned>(passesPerElem);
    bool resultOk = true;
    for (size_t i = 0; i < numElems; i++) {
        if (Out[i] != expected) {
            std::cerr << "Mismatch at element " << i << ": got " << Out[i]
                      << ", expected " << expected << std::endl;
            resultOk = false;
            break;
        }
    }

    // Release everything that was acquired above.
    cudaEventDestroy(startEvt);
    cudaEventDestroy(stopEvt);
    cudaFree(In1d);
    cudaFree(In2d);
    cudaFree(In3d);
    cudaFree(Outd);
    delete[] In1;
    delete[] In2;
    delete[] Out;

    return resultOk ? 0 : 1;
}
The Titan X is not the default device on my system, which is why the code calls cudaSetDevice(1). Make sure to change the device index to match your system when testing.