Hi
I have a simple vector-add kernel that adds two vectors of about one million (1024*1024 = 1,048,576) elements each.
My kernel configuration is 1024*1024/256 = 4096 blocks, each with 256 threads. I have a GeForce GTX 1050 Ti and am running Visual Studio 2015 and Nsight 2019.3 on Windows 10.
Using Nsight, I can achieve good SM activity but poor issue efficiency. My eligible warps average about 1.42, whereas I expected at least three times as many. I tried `__launch_bounds__(256, 4)` and a few other values for its second argument (minBlocksPerMultiprocessor), but saw no improvement.
I have attached my code and screenshots.
Any help is appreciated.
Thanks
My code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
// Number of int elements in each vector (1024 * 1024 = 1,048,576).
// Parenthesized so the macro expands as a single value inside larger
// expressions such as `x % N` or `x / N`.
#define N (1024 * 1024)
// Threads per block; used for the kernel launch and for __launch_bounds__.
#define THREADS_PER_BLOCK 256
// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, N).
//
// Launch layout: 1-D grid of 1-D blocks with at most THREADS_PER_BLOCK threads
// per block (the first __launch_bounds__ argument); the second argument asks
// the compiler to allow at least 4 resident blocks per multiprocessor.
// No shared memory is used. a, b and c must each reference at least N ints
// that are readable/writable from device code.
//
// Each thread starts at its flat global index and advances by the total number
// of launched threads, so every element is processed for any grid size; with a
// grid that already covers N, each thread runs exactly one iteration.
__global__ void __launch_bounds__(THREADS_PER_BLOCK, 4) addKernel(int *c, const int *a, const int *b)
{
    // Index math is done in size_t: blockIdx.x * blockDim.x is an unsigned
    // product that, once narrowed to int, can turn negative on very large
    // grids and would then slip past an `i < N` guard.
    const size_t count = N;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride) {
        c[i] = a[i] + b[i];
    }
}
// Evaluates a CUDA runtime call; on failure prints the error string with the
// source location to stderr and terminates the process.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(status_));                           \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Host driver for addKernel.
//
// Fills a[i] = i and b[i] = i + 1 on the host, copies both to the device,
// launches addKernel to produce c = a + b, copies c back, prints the last ten
// elements, releases all host and device memory, and waits for a key press.
//
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
// CUDA runtime call fails.
int main()
{
    // Capture N once, parenthesized, so later arithmetic does not depend on
    // how the macro body is written.
    const int n = (N);
    const size_t bytes = (size_t)n * sizeof(int);

    // Host buffers.
    int *a = (int *)malloc(bytes);
    int *b = (int *)malloc(bytes);
    int *c = (int *)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)bytes);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < n; i++) {
        a[i] = i;
        b[i] = i + 1;
    }

    // Device buffers.
    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_c, bytes));

    CUDA_CHECK(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice));

    // Round the block count up so a trailing partial block is still launched
    // when n is not a multiple of THREADS_PER_BLOCK.
    const int blocks = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    addKernel<<<blocks, THREADS_PER_BLOCK>>>(d_c, d_a, d_b);
    // A launch does not return a status; invalid configurations are reported
    // through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: waits for the kernel and reports any execution error.
    CUDA_CHECK(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost));

    // Show the last (up to) ten results.
    const int first = (n > 10) ? n - 10 : 0;
    for (int i = first; i < n; i++) {
        printf("a[%d]: %d \tb[%d]: %d \tc[%d]: %d\n", i, a[i], i, b[i], i, c[i]);
    }

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    free(a);
    free(b);
    free(c);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    CUDA_CHECK(cudaDeviceReset());

    // Wait for a key press before returning.
    getch();
    return 0;
}