Hi everyone,
I have a GeForce RTX 2050 GPU (compute capability 8.6) and I’m measuring the execution time of a kernel on it (system: Windows 11, CUDA 12.4). I know that, to reach maximum occupancy, you must consider the shared memory, register, resident blocks/SM, and resident threads/SM limits of the target GPU and choose the grid and block size accordingly. However, the performance I see differs from what I understand from the theory.
These are the specs of my GPU:
Number of SMs: 16
Max number of resident blocks/SM: 16
Max number of resident threads/SM: 1536
I don’t mention the shared memory and register limits because my kernel uses no shared memory and not many registers (the code is at the end).
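For reference, this is the small standalone check I use to confirm those limits on my device. It is only a sketch that queries cudaDeviceProp (the maxBlocksPerMultiProcessor field requires CUDA 11 or newer, which I have):

#include <stdio.h>
#include "cuda_runtime.h"

int main()
{
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, 0);   // device 0 is the RTX 2050
    printf("SMs:               %d\n", prop.multiProcessorCount);
    printf("Max threads/SM:    %d\n", prop.maxThreadsPerMultiProcessor);
    printf("Max blocks/SM:     %d\n", prop.maxBlocksPerMultiProcessor);
    printf("Max threads/block: %d\n", prop.maxThreadsPerBlock);
    return 0;
}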
I have two questions:
- I thought the optimal launch parameters for my kernel would be <<<32, 768>>> (because it puts 2 blocks on each SM, uses every SM, and reaches the maximum number of resident threads per SM). However, when I run the kernel with <<<16, 1024>>>, it takes less time to execute. I can’t explain this behavior. What is the reason?
- To use the full capacity of all SMs, I can use any of these configurations: <<<32, 768>>>, <<<64, 384>>>, …, <<<256, 96>>>. What is the difference between them in theory? When I launch the kernel with these parameters, performance degrades as the block size decreases and the number of blocks increases. Why is that?
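To sanity-check my reasoning, I also query the occupancy API for the block sizes listed above. This is just a rough snippet I run from main(); it assumes the baseBandFilter kernel defined in the code below and zero dynamic shared memory:

int blockSizes[] = { 1024, 768, 384, 192, 96 };
cudaDeviceProp prop{};
cudaGetDeviceProperties(&prop, 0);
for (int b : blockSizes)
{
    int blocksPerSM = 0;
    // Theoretical number of resident blocks per SM for this kernel at block size b.
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, baseBandFilter, b, 0);
    float occupancy = 100.0f * (blocksPerSM * b) / prop.maxThreadsPerMultiProcessor;
    printf("block = %4d  blocks/SM = %d  theoretical occupancy = %.1f%%\n",
           b, blocksPerSM, occupancy);
}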
This is the code snippet (I read the filter coefficients and the input signal from a file; I’ve omitted that part of the code):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <vector>
#include "cuComplex.h"
__constant__ float d_basebandFilterCoeffs[1024];
__global__ void baseBandFilter(cuComplex* output, cuComplex* input,
                               const int filterLen, const int inputLen)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    cuComplex sum;
    // Grid-stride loop: each thread handles every (blockDim.x * gridDim.x)-th sample.
    while (i < inputLen)
    {
        sum = {};
        // FIR convolution with the coefficients in constant memory;
        // 'input' is front-padded with filterLen - 1 zeros by the host.
        for (int j{}; j < filterLen; ++j)
        {
            sum.x += d_basebandFilterCoeffs[j] * input[i - j + filterLen - 1].x;
            sum.y += d_basebandFilterCoeffs[j] * input[i - j + filterLen - 1].y;
        }
        output[i] = sum;
        i += (blockDim.x * gridDim.x);
    }
}
#define gpuErrchk() { gpuAssert(__FILE__, __LINE__); }

inline void gpuAssert(const char* file, int line, bool abort = true)
{
    cudaDeviceSynchronize();
    cudaError_t code = cudaGetLastError();
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
int main()
{
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        return 1;
    }

    const int inputLen = 2 * 1024 * 1024;
    int filterLen = 513;
    std::vector<float> filterCoeffs(filterLen);
    std::vector<cuComplex> input(inputLen);
    std::vector<cuComplex> output(inputLen);

    cuComplex* d_input;
    cuComplex* d_output;
    cudaMalloc(&d_input, (inputLen + (filterLen - 1)) * sizeof(cuComplex));
    cudaMalloc(&d_output, inputLen * sizeof(cuComplex));
    gpuErrchk();

    // Copy the signal after a zero-padded prefix of filterLen - 1 samples
    // so the kernel can safely read input[i - j + filterLen - 1].
    cudaMemcpy(d_input + filterLen - 1, input.data(), inputLen * sizeof(cuComplex), cudaMemcpyHostToDevice);
    cudaMemset(d_input, 0, (filterLen - 1) * sizeof(cuComplex));
    cudaMemcpyToSymbol(d_basebandFilterCoeffs, filterCoeffs.data(), filterLen * sizeof(float));
    gpuErrchk();

    cudaEvent_t _start, _stop;
    cudaEventCreate(&_start);
    cudaEventCreate(&_stop);

    cudaEventRecord(_start, 0);
    // Change block and grid size for different tests.
    baseBandFilter<<<16, 1024>>>(d_output, d_input, filterLen, inputLen);
    cudaEventRecord(_stop, 0);
    cudaEventSynchronize(_stop);

    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, _start, _stop);
    std::cout << "Elapsed time: " << elapsedTime << " ms" << std::endl;
    gpuErrchk();

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}