Undefined reference to __syncthreads()

Hi, below is my code for a sum reduction. I am using CUDA 11.3 with Visual Studio 2019, and I am getting an undefined reference to the __syncthreads() function.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "add_kernel.cuh"
#include <stdio.h>


// Threads per block used by the host launcher; also the number of input
// elements each block reduces.
#define SIZE 64
// Number of int elements in the kernel's static shared-memory array.
// Parenthesized so the macro expands safely inside larger expressions
// (e.g. `x / SHMEM_SIZE`).
// NOTE(review): this is used as an element count, not a byte count, so the
// array holds 256 ints although only SIZE entries are touched per block.
#define SHMEM_SIZE (64 * 4)
/**
 * Block-level integer sum reduction, performed in place on `a`.
 *
 * Each block loads blockDim.x consecutive ints of `a` into shared memory,
 * tree-reduces them, and thread 0 stores the block total to a[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid, 1D block.
 *   - blockDim.x must be a power of two (the halving loop skips elements
 *     otherwise) and must not exceed SHMEM_SIZE.
 *   - Uses SHMEM_SIZE ints of static shared memory.
 *
 * @param a    Device pointer. On entry: the values to sum. On exit: a[b]
 *             holds the sum computed by block b, for b in [0, gridDim.x).
 * @param size Number of valid elements in `a`. Threads whose global index is
 *             >= size contribute 0 instead of reading past the end.
 *
 * NOTE(review): because input and output share one buffer, block b's store to
 * a[b] is not ordered against the block that loads a[b] as input (there is no
 * inter-block synchronization here). Writing results to a separate output
 * buffer would remove that hazard but requires a signature change.
 */
__global__ void reduce(int* a, int size)
{
   __shared__ int partial_sum[SHMEM_SIZE];

   // Global element index handled by this thread
   int tid = blockIdx.x * blockDim.x + threadIdx.x;

   // Load one element per thread into shared memory; out-of-range threads
   // load the additive identity so they do not disturb the sum.
   partial_sum[threadIdx.x] = (tid < size) ? a[tid] : 0;
   __syncthreads();

   // Start at 1/2 block stride and divide by two each iteration
   for (int s = blockDim.x / 2; s > 0; s >>= 1) {
      // Each thread does work unless it is further than the stride
      if (threadIdx.x < s) {
         partial_sum[threadIdx.x] += partial_sum[threadIdx.x + s];
      }
      // Barrier is outside the branch so every thread in the block reaches it
      __syncthreads();
   }

   // Let thread 0 of this block write its result to main memory.
   // The result is indexed by this block's index.
   if (threadIdx.x == 0) {
      a[blockIdx.x] = partial_sum[0];
   }

}

/**
 * Host wrapper: copies `a` to the GPU, runs the `reduce` kernel with SIZE
 * threads per block, and copies the per-block partial sums back into `a`.
 *
 * On success, a[0 .. ceil(N / SIZE) - 1] hold the per-block partial sums; the
 * remaining entries of `a` are left unchanged. The total is the sum of those
 * partial sums (divide it by N to obtain the average).
 *
 * The device buffer is padded with zeros up to a whole number of blocks, so N
 * does not have to be a multiple of SIZE.
 *
 * @param a Host array of N ints (input, and output for the partial sums).
 * @param N Number of elements in `a`. N == 0 is a no-op.
 *          NOTE(review): N is passed to the kernel as an int; values above
 *          INT_MAX are not supported.
 * @return  cudaSuccess, or the first CUDA error encountered.
 */
cudaError_t sum_reduce(int* a, unsigned int N)
{
   // All locals are declared (and initialized) before the first `goto Error`
   // so that no jump bypasses an initialization, and so that the cleanup path
   // never sees an indeterminate pointer (cudaFree(nullptr) is a no-op).
   cudaError_t cudaStatus = cudaSuccess;
   int* dev_a = nullptr;

   // Threads per block
   const int TB_SIZE = SIZE;

   // Number of blocks: ceil(N / SIZE), written so it cannot overflow
   const unsigned int GRID_SIZE = N / SIZE + ((N % SIZE) != 0 ? 1u : 0u);

   // Element count of the device buffer, rounded up to whole blocks
   const size_t padded_count = (size_t)GRID_SIZE * SIZE;

   if (N == 0) {
      // Nothing to reduce; a zero-block launch would be an invalid configuration.
      return cudaStatus;
   }

   cudaStatus = cudaSetDevice(0);
   if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
      goto Error;
   }

   // Allocate the single in/out GPU buffer, padded to a whole number of blocks.
   cudaStatus = cudaMalloc((void**)&dev_a, padded_count * sizeof(int));
   if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaMalloc failed!");
      goto Error;
   }

   // Zero the padding so the extra slots do not affect the sums.
   if (padded_count > N) {
      cudaStatus = cudaMemset(dev_a + N, 0, (padded_count - N) * sizeof(int));
      if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemset failed!");
         goto Error;
      }
   }

   cudaStatus = cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
   if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaMemcpy (host to device) failed!");
      goto Error;
   }

   reduce<<<GRID_SIZE, TB_SIZE>>>(dev_a, N);

   // Kernel launches do not return a status; pick up launch-configuration errors here.
   cudaStatus = cudaGetLastError();
   if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "reduce launch failed: %s", cudaGetErrorString(cudaStatus));
      goto Error;
   }

   // Blocking copy: also waits for the kernel and reports any execution error.
   cudaStatus = cudaMemcpy(a, dev_a, GRID_SIZE * sizeof(int), cudaMemcpyDeviceToHost);
   if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaMemcpy (device to host) failed!");
      goto Error;
   }

Error:
   cudaFree(dev_a);
   return cudaStatus;
}

If I add the lines below to the code, I am able to build without any errors. Is this the right way to resolve the issue? If not, can anyone suggest the right method?

#pragma once
// Editor-only shim: __INTELLISENSE__ is tested so that this prototype is seen
// only when that macro is defined; a build where it is undefined gets nothing
// from this block.
// NOTE(review): presumably the "undefined" report comes from the IDE parser
// rather than from the compiler/linker — confirm against the real build output.
#if defined(__INTELLISENSE__)
void __syncthreads();
#endif