Hi,
I am new to Nsight Compute and am having a hard time understanding SMs.
Here is a simple code I’m working on
#include <iostream>
#include <cuda_runtime.h>
#include <vector>
// Naive dense matrix multiply: C = A * B.
// A is [A_rows x A_cols], B is [A_cols x B_cols], C is [A_rows x B_cols],
// all row-major. Expected launch: a 2D grid where (x, y) maps to (col, row)
// of C; out-of-range threads exit immediately, so the grid may overshoot.
__global__ void matrixMultiplyKernel(const float *A, const float *B, float *C, int A_rows, int A_cols, int B_cols)
{
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int c = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against threads outside the output matrix (grid rarely divides evenly).
    if (r >= A_rows || c >= B_cols)
        return;

    // Dot product of row r of A with column c of B.
    float acc = 0.0f;
    for (int k = 0; k < A_cols; ++k)
        acc += A[r * A_cols + k] * B[k * B_cols + c];

    C[r * B_cols + c] = acc;
}
// Multiplies h_A1 [A_rows x A_cols] by h_B1 [A_cols x B_cols] into
// h_C1 [A_rows x B_cols] on device 0. All host buffers are row-major and
// caller-owned; h_C1 is fully written on successful return.
//
// Fixes vs. the original: the host buffers here are pageable (std::vector
// storage), so cudaMemcpyAsync gains nothing and — crucially — the original
// returned and freed device memory with no synchronization, so h_C1 was not
// guaranteed valid by the API contract. Blocking cudaMemcpy is used instead,
// and every CUDA call (including the kernel launch) is error-checked.
void matrixMultiply(const float *h_A1, const float *h_B1, float *h_C1, int A_rows, int A_cols, int B_cols)
{
    // Local error check: report and bail out of the program on any failure,
    // since a sticky CUDA error would make every subsequent call fail anyway.
    auto check = [](cudaError_t err, const char *what) {
        if (err != cudaSuccess)
        {
            std::cerr << "CUDA error in " << what << ": "
                      << cudaGetErrorString(err) << std::endl;
            std::abort();
        }
    };

    // Widen before multiplying so large dimensions cannot overflow int.
    size_t size_A = static_cast<size_t>(A_rows) * A_cols * sizeof(float);
    size_t size_B = static_cast<size_t>(A_cols) * B_cols * sizeof(float);
    size_t size_C = static_cast<size_t>(A_rows) * B_cols * sizeof(float);

    float *d_A1 = nullptr, *d_B1 = nullptr, *d_C1 = nullptr;
    check(cudaMalloc((void **)&d_A1, size_A), "cudaMalloc(d_A1)");
    check(cudaMalloc((void **)&d_B1, size_B), "cudaMalloc(d_B1)");
    check(cudaMalloc((void **)&d_C1, size_C), "cudaMalloc(d_C1)");

    // Blocking copies: the host pointers are pageable memory, so async copies
    // would not overlap anything here.
    check(cudaMemcpy(d_A1, h_A1, size_A, cudaMemcpyHostToDevice), "H2D A");
    check(cudaMemcpy(d_B1, h_B1, size_B, cudaMemcpyHostToDevice), "H2D B");

    // One thread per output element; ceil-divide so the grid covers C fully.
    dim3 blockSize(4, 4);
    dim3 gridSize((B_cols + blockSize.x - 1) / blockSize.x,
                  (A_rows + blockSize.y - 1) / blockSize.y);
    matrixMultiplyKernel<<<gridSize, blockSize>>>(d_A1, d_B1, d_C1, A_rows, A_cols, B_cols);
    // Kernel launches do not return an error directly; query it explicitly.
    check(cudaGetLastError(), "kernel launch");

    // Blocking D2H copy: synchronizes with the kernel, so h_C1 is valid
    // (and the device work finished) before the buffers are freed below.
    check(cudaMemcpy(h_C1, d_C1, size_C, cudaMemcpyDeviceToHost), "D2H C");

    check(cudaFree(d_A1), "cudaFree(d_A1)");
    check(cudaFree(d_B1), "cudaFree(d_B1)");
    check(cudaFree(d_C1), "cudaFree(d_C1)");
}
// Driver: prints runtime version and SM count, multiplies a 4x3 all-ones
// matrix by a 3x4 all-twos matrix, and verifies every element of the 4x4
// result equals A_cols * 1 * 2 = 6. Returns 0 on success, 1 on mismatch or
// CUDA failure (the original never checked the result at all).
int main()
{
    int runtimeVersion = 0;
    cudaRuntimeGetVersion(&runtimeVersion);
    std::cout << "CUDA Runtime Version: " << runtimeVersion / 1000 << "." << (runtimeVersion % 1000) / 10 << std::endl;

    int sm_count = 0;
    cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, 0);
    std::cout << "Total SMs: " << sm_count << std::endl;

    // B's row count must equal A_cols for the product to be defined.
    const int A_rows = 4, A_cols = 3, B_cols = 4;
    std::vector<float> h_A1(A_rows * A_cols, 1.0f);
    std::vector<float> h_B1(A_cols * B_cols, 2.0f);
    std::vector<float> h_C1(A_rows * B_cols, 0.0f);

    matrixMultiply(h_A1.data(), h_B1.data(), h_C1.data(), A_rows, A_cols, B_cols);

    // Make sure all device work has finished (and surface any async kernel
    // error) before the host reads the result.
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // Each C element is the dot product of A_cols ones with A_cols twos.
    const float expected = 2.0f * A_cols;
    for (size_t i = 0; i < h_C1.size(); ++i)
    {
        if (h_C1[i] != expected)
        {
            std::cerr << "Result mismatch at element " << i << ": got "
                      << h_C1[i] << ", expected " << expected << std::endl;
            return 1;
        }
    }
    std::cout << "Result verified: all " << h_C1.size()
              << " elements equal " << expected << std::endl;
    return 0;
}
This code simply multiplies two matrices A and B, of sizes [4, 3] and [3, 4] respectively, producing a [4, 4] result matrix.
The gridsize is [1,1] and the blocksize is [4, 4] for the kernel.
Because of this block size, I expected this to activate 1 warp: there are only 4*4 = 16 threads in the one block, and a single warp can run up to 32 threads. However, when I ran this code under ncu and looked at the metric sm__warps_active, the output was:
sm__warps_active.avg [warp] 34.46
sm__warps_active.max [warp] 2826
sm__warps_active.min [warp] 0
sm__warps_active.sum [warp] 2826
So now I am very confused. Why isn’t this shown as 1, instead of 2826?
Thanks for your attention.