My card is an RTX 2080 Ti, my CUDA version is 11.4, and I compile the .cu file with nvcc -arch=sm_75. When I run the command "sudo /usr/local/cuda-11.4/bin/ncu --metrics smsp__sass_average_branch_targets_threads_uniform.pct,smsp__sass_thread_inst_executed_op_control_pred_on.sum ./a.out" to check branch efficiency, both metrics are reported as zero.
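(For reference, the full compile command is just "nvcc -arch=sm_75 main.cu -o a.out", where main.cu stands in for my actual file name.)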
The kernel is very simple, as follows:
" global void MatAddKernel0(float* a, float* b, float* c, int mat_size) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < mat_size) { // boundary check
c[i] = a[i] + b[i];
}
}
"
I also tried other kernels, and the outputs were always zero. So why does that happen? I would honestly appreciate any suggestions.
The complete test code is below. Interestingly, only kernel1 reports 100 percent branch efficiency; the metric output for all the others is zero. However, if I compile the .cu file with the "-G" option, the branch efficiency metrics work fine, and kernel0 reports 100 percent instead of 0.
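My unverified guess is that, with optimizations on, nvcc compiles the simple bounds check into predicated instructions rather than an actual branch, so there is nothing for the branch metrics to count, while the "-G" build keeps the real branch. One way to check this would be to compare the SASS of the two builds, e.g. with "cuobjdump -sass a.out | grep BRA" (cuobjdump ships with the CUDA toolkit, and BRA is the SASS branch instruction).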
" include include <cuda_runtime.h> include <cuda_runtime_api.h> include include “…/…/include/common.h” define BLOCK_DIM 32 define H 2000 define W 4000
__global__ void DummyKernel0(float* a, float* b, float* c, int mat_size) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < mat_size) {  // boundary check
    c[i] = a[i] + b[i];
  }
}
__global__ void DummyKernel1(float* a, float* b, float* c, int mat_size) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int warp_id = i / 32;  // uniform within a warp, so the switch below does not diverge
  if (i < mat_size) {    // boundary check
    switch (warp_id) {
      case 0: c[i] = a[i] + b[i]; break;
      case 1: c[i] = a[i] - b[i]; break;
      case 2: c[i] = a[i] + b[i]; break;
      case 3: c[i] = a[i] - b[i]; break;
      default: c[i] = a[i] + b[i]; break;
    }
  }
}
__global__ void DummyKernel2(float* a, float* b, float* c, int mat_size) {  // nvcc optimizes this branch automatically
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < mat_size) {
    if (i % 2 == 0) {
      c[i] = a[i] + b[i];
    } else {
      c[i] = a[i] - b[i];
    }
  }
}
__global__ void DummyKernel3(float* a, float* b, float* c, int mat_size) {  // nvcc would not optimize this one
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < mat_size) {
    const bool cond = (i % 2 == 0);
    if (cond) {
      c[i] = a[i] + b[i];
    }
    if (!cond) {
      c[i] = a[i] - b[i];
    }
  }
}
__global__ void DummyKernel4(float* a, float* b, float* c, int mat_size) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int lane_id = i % 32;  // varies within a warp, so the switch below diverges
  if (i < mat_size) {
    switch (lane_id) {
      case 0: c[i] = a[i] + b[i]; break;
      case 1: c[i] = a[i] - b[i]; break;
      case 2: c[i] = a[i] + b[i]; break;
      case 3: c[i] = a[i] - b[i]; break;
      default: c[i] = a[i] + b[i]; break;
    }
  }
}
ProcessRecord MatAdd(float* h_mat_a, float* h_mat_b, float* h_mat_c, int mat_rows, int mat_cols, CtrlCmd cmd) {
  // 1. Allocate device memory
  int mat_size = mat_rows * mat_cols;
  float *d_mat_a, *d_mat_b, *d_mat_c;
  cudaMalloc((void**)&d_mat_a, mat_size * sizeof(float));
  cudaMalloc((void**)&d_mat_b, mat_size * sizeof(float));
  cudaMalloc((void**)&d_mat_c, mat_size * sizeof(float));
  // 2. Copy the inputs host-to-device
  const auto t0 = std::chrono::steady_clock::now();
  cudaMemcpy(d_mat_a, h_mat_a, mat_size * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_mat_b, h_mat_b, mat_size * sizeof(float), cudaMemcpyHostToDevice);
  // 3. Launch the selected kernel
  const auto t1 = std::chrono::steady_clock::now();
  int block_size = BLOCK_DIM;
  int grid_size = std::ceil(static_cast<float>(mat_rows * mat_cols) / block_size);
  switch (cmd) {
    case kUniformKernel1:
      DummyKernel0<<<grid_size, block_size>>>(d_mat_a, d_mat_b, d_mat_c, mat_rows * mat_cols); break;
    case kUniformKernel2:
      DummyKernel1<<<grid_size, block_size>>>(d_mat_a, d_mat_b, d_mat_c, mat_rows * mat_cols); break;
    case kDiverKernel1:
      DummyKernel2<<<grid_size, block_size>>>(d_mat_a, d_mat_b, d_mat_c, mat_rows * mat_cols); break;
    case kDiverKernel2:
      DummyKernel3<<<grid_size, block_size>>>(d_mat_a, d_mat_b, d_mat_c, mat_rows * mat_cols); break;
    case kDiverKernel3:
      DummyKernel4<<<grid_size, block_size>>>(d_mat_a, d_mat_b, d_mat_c, mat_rows * mat_cols); break;
  }
  cudaDeviceSynchronize();
  // 4. Copy the result device-to-host
  const auto t2 = std::chrono::steady_clock::now();
  cudaMemcpy(h_mat_c, d_mat_c, mat_size * sizeof(float), cudaMemcpyDeviceToHost);
  // 5. Free device memory
  const auto t3 = std::chrono::steady_clock::now();
  cudaFree(d_mat_a);
  cudaFree(d_mat_b);
  cudaFree(d_mat_c);
  const auto t4 = std::chrono::steady_clock::now();
  return ProcessRecord((t1-t0).count()*1e-6, (t2-t1).count()*1e-6, (t3-t2).count()*1e-6, (t4-t0).count()*1e-6);
}
int main(int argc, char* argv[]) {
  // 0. Parse the kernel selector from the command line
  CtrlCmd cmd = kUniformKernel1;
  int arg1 = (argc > 1) ? std::atoi(argv[1]) : 0;
  switch (arg1) {
    case 0: cmd = kUniformKernel1; break;
    case 1: cmd = kUniformKernel2; break;
    case 2: cmd = kDiverKernel1; break;
    case 3: cmd = kDiverKernel2; break;
    case 4: cmd = kDiverKernel3; break;
  }
  // 1. Create pinned input data
  float *h_mat_a, *h_mat_b, *h_mat_c;
  cudaHostAlloc(&h_mat_a, H * W * sizeof(float), cudaHostAllocDefault);
  cudaHostAlloc(&h_mat_b, H * W * sizeof(float), cudaHostAllocDefault);
  cudaHostAlloc(&h_mat_c, H * W * sizeof(float), cudaHostAllocDefault);
  SET_HOST_MAT_DATA(h_mat_a, H, W);
  SET_HOST_MAT_DATA(h_mat_b, H, W);
  // 2. Call the cuda wrapper function
  WARM_UP(MatAdd(h_mat_a, h_mat_b, h_mat_c, H, W, cmd));
  const auto record1 = MatAdd(h_mat_a, h_mat_b, h_mat_c, H, W, cmd);
  // 3. Release memory
  cudaFreeHost(h_mat_a);
  cudaFreeHost(h_mat_b);
  cudaFreeHost(h_mat_c);
  std::cout << record1.kernel_exe << "ms" << std::endl;
  return 0;
}
"
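For completeness: none of the CUDA calls above are checked, so a failed allocation or launch would go unnoticed. A minimal checking macro, just a sketch of mine assuming common.h does not already provide an equivalent, would look like this:
"
// Minimal CUDA error check (a sketch; assumes common.h has no equivalent).
// Wrap each API call, and call CUDA_CHECK(cudaGetLastError()) right after
// a kernel launch to catch launch failures.
#define CUDA_CHECK(call)                                            \
  do {                                                              \
    cudaError_t err_ = (call);                                      \
    if (err_ != cudaSuccess) {                                      \
      std::cerr << __FILE__ << ":" << __LINE__ << " CUDA error: "   \
                << cudaGetErrorString(err_) << std::endl;           \
      std::exit(EXIT_FAILURE);                                      \
    }                                                               \
  } while (0)
"
Also note that, because of WARM_UP, each kernel is launched twice per run; if I understand the ncu options correctly, adding "--kernel-name DummyKernel0 --launch-count 1" should restrict the report to a single launch.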
We can't build your code successfully; it seems the formatting is messed up. Can you provide the file for us to download, along with the build command line? Thanks!