I was experimenting with the number of threads per block and noticed that a lower number seems to give better overall performance; for example, 128 threads per block runs faster than 1024. I had assumed that maximizing the threads per block would give the best performance, but that doesn’t seem to be the case.
Could someone point me to the theoretical reason why fewer threads per block can outperform simply using the maximum number of threads per block?
(This is executed on a T4 on Google Colab. Changing the MAX_NUM_THREADS_PER_BLOCK macro from 1024 to 128 gives slightly better performance here; the difference is even more pronounced in another application I’m testing.)
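For context, with num = 1024 * 1024 both configurations launch the same total number of threads; only the grid shape changes: (1024 * 1024) / 128 = 8192 blocks of 128 threads versus (1024 * 1024) / 1024 = 1024 blocks of 1024 threads.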
%%writefile TestProg.cu
#include <iostream>
#include <cstdio>
#include <new>
#include "cuda_fp16.h"

using namespace std;

#define MAX_NUM_THREADS_PER_BLOCK 128

#define gpuErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

struct foo
{
    int num;
    half2 *y = NULL;
    half2 *m = NULL;
    half2 *x = NULL;
    half2 *b = NULL;

    foo(int num) : num(num)
    {
        // Allocate managed memory so the arrays are accessible from host and device.
        cudaMallocManaged(&y, sizeof(half2) * num);
        cudaMallocManaged(&m, sizeof(half2) * num);
        cudaMallocManaged(&x, sizeof(half2) * num);
        cudaMallocManaged(&b, sizeof(half2) * num);

        for (int i = 0; i < num; i++)
        {
            y[i] = __float2half2_rn(2.0f);
            m[i] = __float2half2_rn(2.0f);
            x[i] = __float2half2_rn(2.0f);
            b[i] = __float2half2_rn(2.0f);
        }

        // Prefetch the arrays to the GPU so the kernel doesn't page-fault on first touch.
        int deviceId;
        cudaGetDevice(&deviceId);
        cudaMemPrefetchAsync(y, sizeof(half2) * num, deviceId);
        cudaMemPrefetchAsync(m, sizeof(half2) * num, deviceId);
        cudaMemPrefetchAsync(x, sizeof(half2) * num, deviceId);
        cudaMemPrefetchAsync(b, sizeof(half2) * num, deviceId);
    }

    // Grid-stride loop: y = m * x + b over all elements.
    __device__ inline void bar()
    {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        int stride = blockDim.x * gridDim.x;
        for (int i = index; i < num; i += stride)
        {
            y[i] = m[i] * x[i] + b[i];
        }
    }
};

__global__ void run(foo foos, int num)
{
    foos.bar();
}

int main()
{
    int num = 1024 * 1024;

    foo *foos = NULL;
    cudaMallocManaged(&foos, sizeof(foo));
    new (foos) foo(num);  // placement-new into managed memory

    int deviceId;
    cudaGetDevice(&deviceId);
    cudaMemPrefetchAsync(foos, sizeof(foo), deviceId);
    gpuErrorCheck(cudaPeekAtLastError());
    gpuErrorCheck(cudaDeviceSynchronize());

    int numberOfBlocks = (num + MAX_NUM_THREADS_PER_BLOCK - 1) / MAX_NUM_THREADS_PER_BLOCK;

    // Run the kernel twice; the first launch can include one-time costs.
    run<<<numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK>>>(*foos, num);
    gpuErrorCheck(cudaPeekAtLastError());
    gpuErrorCheck(cudaDeviceSynchronize());

    run<<<numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK>>>(*foos, num);
    gpuErrorCheck(cudaPeekAtLastError());
    gpuErrorCheck(cudaDeviceSynchronize());

    return 0;
}
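Side note: I came across cudaOccupancyMaxPotentialBlockSize, which (as I understand it) asks the runtime for a block size that maximizes occupancy for a given kernel. Just as a sketch (I haven't benchmarked this), I think it would slot into main() above roughly like this:

    // Sketch only: let the runtime suggest a block size for the run kernel.
    int minGridSize = 0;
    int suggestedBlockSize = 0;
    gpuErrorCheck(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &suggestedBlockSize, run, 0, 0));
    printf("Suggested block size for run(): %d (minimum grid size: %d)\n", suggestedBlockSize, minGridSize);

    // Launch with the suggested block size instead of MAX_NUM_THREADS_PER_BLOCK.
    int blocks = (num + suggestedBlockSize - 1) / suggestedBlockSize;
    run<<<blocks, suggestedBlockSize>>>(*foos, num);
    gpuErrorCheck(cudaPeekAtLastError());
    gpuErrorCheck(cudaDeviceSynchronize());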
Performance with 128 threads:
==3726== NVPROF is profiling process 3726, command: ./TestProg
==3726== Profiling application: ./TestProg
==3726== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 129.50us 2 64.751us 63.295us 66.207us run(foo, int)
API calls: 91.79% 136.00ms 5 27.199ms 18.820us 135.86ms cudaMallocManaged
7.24% 10.725ms 3 3.5751ms 63.126us 10.593ms cudaDeviceSynchronize
0.64% 946.13us 5 189.23us 5.1120us 757.87us cudaMemPrefetchAsync
0.21% 314.01us 2 157.01us 8.9780us 305.04us cudaLaunchKernel
0.10% 144.83us 114 1.2700us 156ns 57.261us cuDeviceGetAttribute
0.01% 12.813us 1 12.813us 12.813us 12.813us cuDeviceGetName
0.01% 10.377us 2 5.1880us 1.8500us 8.5270us cudaGetDevice
0.00% 5.0200us 1 5.0200us 5.0200us 5.0200us cuDeviceGetPCIBusId
0.00% 4.6180us 1 4.6180us 4.6180us 4.6180us cuDeviceTotalMem
0.00% 1.6790us 3 559ns 231ns 1.1630us cuDeviceGetCount
0.00% 1.4430us 3 481ns 286ns 609ns cudaPeekAtLastError
0.00% 1.0270us 2 513ns 215ns 812ns cuDeviceGet
0.00% 487ns 1 487ns 487ns 487ns cuModuleGetLoadingMode
0.00% 343ns 1 343ns 343ns 343ns cuDeviceGetUuid
==3726== Unified Memory profiling result:
Device "Tesla T4 (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
9 1.7782MB 4.0000KB 2.0000MB 16.00391MB 1.398856ms Host To Device
1 4.0000KB 4.0000KB 4.0000KB 4.000000KB 1.888000us Device To Host
Total CPU Page faults: 50
Performance with 1024 threads:
==5309== NVPROF is profiling process 5309, command: ./TestProg
==5309== Profiling application: ./TestProg
==5309== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 146.62us 2 73.310us 72.447us 74.174us run(foo, int)
API calls: 97.35% 109.32ms 5 21.864ms 13.430us 109.23ms cudaMallocManaged
1.33% 1.4988ms 3 499.60us 73.074us 1.3485ms cudaDeviceSynchronize
0.91% 1.0164ms 5 203.27us 7.8520us 644.41us cudaMemPrefetchAsync
0.26% 287.86us 2 143.93us 10.699us 277.17us cudaLaunchKernel
0.12% 139.71us 114 1.2250us 144ns 55.623us cuDeviceGetAttribute
0.01% 11.217us 1 11.217us 11.217us 11.217us cuDeviceGetName
0.01% 8.3230us 2 4.1610us 1.7540us 6.5690us cudaGetDevice
0.00% 5.5570us 1 5.5570us 5.5570us 5.5570us cuDeviceGetPCIBusId
0.00% 4.8530us 1 4.8530us 4.8530us 4.8530us cuDeviceTotalMem
0.00% 1.7480us 3 582ns 360ns 719ns cudaPeekAtLastError
0.00% 1.7100us 3 570ns 197ns 1.1880us cuDeviceGetCount
0.00% 873ns 2 436ns 189ns 684ns cuDeviceGet
0.00% 580ns 1 580ns 580ns 580ns cuModuleGetLoadingMode
0.00% 195ns 1 195ns 195ns 195ns cuDeviceGetUuid
==5309== Unified Memory profiling result:
Device "Tesla T4 (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
9 1.7782MB 4.0000KB 2.0000MB 16.00391MB 1.392620ms Host To Device
1 4.0000KB 4.0000KB 4.0000KB 4.000000KB 1.888000us Device To Host
Total CPU Page faults: 50
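In case the nvprof averages hide any variance, this is roughly how I'd time a launch directly with CUDA events to compare the two block sizes (again just a sketch, not something I've run):

    // Sketch only: time one launch with CUDA events and print the elapsed time.
    cudaEvent_t start, stop;
    gpuErrorCheck(cudaEventCreate(&start));
    gpuErrorCheck(cudaEventCreate(&stop));

    gpuErrorCheck(cudaEventRecord(start));
    run<<<numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK>>>(*foos, num);
    gpuErrorCheck(cudaEventRecord(stop));
    gpuErrorCheck(cudaEventSynchronize(stop));

    float milliseconds = 0.0f;
    gpuErrorCheck(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("run<<<%d, %d>>> took %f ms\n", numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK, milliseconds);

    gpuErrorCheck(cudaEventDestroy(start));
    gpuErrorCheck(cudaEventDestroy(stop));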
Thanks again for the help!! I’m still learning all this haha, there’s a lot to learn :D