Need Help on Issue Efficiency

Hi
I have a simple vector-add kernel that adds two vectors of about 1 million elements (1024*1024) each.
My kernel configuration is 1024*1024/256 blocks, each with 256 threads. I have a GeForce GTX 1050 Ti, running Visual Studio 2015 and Nsight 2019.3 on Windows 10.
Using Nsight, I can achieve good SM activity, but poor issue efficiency. My eligible warps per cycle are about 1.42, whereas I expected a value at least three times higher. I tried using `__launch_bounds__(256, 4)` and a few other values for the second (minBlocksPerMultiprocessor) argument, but I got no improvement.
I have attached my code and screenshots.
Any help is appreciated.
Thanks

My code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
#define N 1024*1024
#define THREADS_PER_BLOCK 256

// Element-wise vector addition on the device: c[i] = a[i] + b[i] for every
// i in [0, N).
//
// Launch layout: 1-D grid of 1-D blocks. __launch_bounds__ promises the
// compiler that a block never has more than THREADS_PER_BLOCK threads and asks
// it to keep register usage low enough for at least 4 resident blocks per SM.
// Any grid size is valid: when the grid has fewer than N threads each thread
// walks the vectors in steps of the total thread count, and surplus threads
// fall straight through the loop.
//
// Preconditions: a, b and c point to device memory holding at least N ints and
// must not overlap (all three are declared __restrict__ so the loads of a and
// b can be treated as read-only for the lifetime of the kernel).
//
// NOTE(review): each element costs two loads and one store for a single add,
// so throughput is presumably limited by memory bandwidth rather than by
// instruction issue -- confirm against the profiler's memory statistics.
__global__ void __launch_bounds__(THREADS_PER_BLOCK, 4)
addKernel(int *__restrict__ c, const int *__restrict__ a, const int *__restrict__ b)
{
	// Bind the macro to a typed constant once so the comparison below is
	// done in size_t and does not depend on how N is spelled.
	const size_t n = N;

	// 64-bit arithmetic keeps blockIdx.x * blockDim.x from wrapping on very
	// large grids.
	const size_t stride = (size_t)gridDim.x * blockDim.x;

	for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
		c[i] = a[i] + b[i];
}

// Evaluates a CUDA runtime call and terminates the program with a readable
// message (file, line, error string) if it did not return cudaSuccess.
#define CUDA_CHECK(call) \
	do { \
		cudaError_t err_ = (call); \
		if (err_ != cudaSuccess) { \
			fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
				cudaGetErrorString(err_)); \
			exit(EXIT_FAILURE); \
		} \
	} while (0)

// Host driver: fills two N-element vectors, adds them on the GPU with
// addKernel, prints the last (up to) ten results and releases all resources.
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
// CUDA call fails.
int main()
{
	// Bind the macro to a typed constant so later arithmetic does not depend
	// on how N is spelled.
	const int n = N;
	// size_t rather than int: the byte count is passed to malloc/cudaMalloc.
	const size_t size = (size_t)n * sizeof(int);

	int *a = (int *)malloc(size);
	int *b = (int *)malloc(size);
	int *c = (int *)malloc(size);
	if (a == NULL || b == NULL || c == NULL) {
		fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
		free(a); free(b); free(c);   // free(NULL) is a no-op
		return EXIT_FAILURE;
	}

	int *d_a = NULL, *d_b = NULL, *d_c = NULL;
	CUDA_CHECK(cudaMalloc((void **)&d_a, size));
	CUDA_CHECK(cudaMalloc((void **)&d_b, size));
	CUDA_CHECK(cudaMalloc((void **)&d_c, size));

	for (int i = 0; i < n; i++) {
		a[i] = i;
		b[i] = i + 1;
	}
	CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

	// Round the block count up so a tail of fewer than THREADS_PER_BLOCK
	// elements is still covered when n is not a multiple of the block size.
	const int blocks = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
	addKernel<<<blocks, THREADS_PER_BLOCK>>>(d_c, d_a, d_b);
	// A launch does not return a status; a bad configuration is only
	// reported through cudaGetLastError().
	CUDA_CHECK(cudaGetLastError());

	// This blocking copy waits for the kernel to finish, so a fault raised
	// while the kernel was running is reported here.
	CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));

	const int first = (n > 10) ? n - 10 : 0;
	for (int i = first; i < n; i++) {
		printf("a[%d]: %d \tb[%d]: %d \tc[%d]: %d\n", i, a[i], i, b[i], i, c[i]);
	}

	CUDA_CHECK(cudaFree(d_a));
	CUDA_CHECK(cudaFree(d_b));
	CUDA_CHECK(cudaFree(d_c));
	free(a);
	free(b);
	free(c);

	// cudaDeviceReset must be called before exiting in order for profiling and
	// tracing tools such as Nsight and Visual Profiler to show complete traces.
	CUDA_CHECK(cudaDeviceReset());

	// Wait for a key press so the console window stays open.
	getch();

	return 0;
}