Hi, below is my code to compute the reduce function. I am using CUDA 11.3 with Visual Studio 2019, and I am getting an undefined reference to the __syncthreads() function.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "add_kernel.cuh"
#include <stdio.h>
// Number of threads per block used by the host-side launch.
#define SIZE 64
// Number of int slots in the kernel's static shared-memory array.
// Parenthesized so the macro expands as a single value inside larger
// expressions (e.g. `x / SHMEM_SIZE` must not become `x / 64 * 4`).
#define SHMEM_SIZE (64 * 4)
/**
 * Per-block sum reduction performed in place on `a`.
 *
 * Each block loads blockDim.x consecutive ints into shared memory, sums them
 * with a halving-stride tree, and thread 0 stores the block's total in
 * a[blockIdx.x].
 *
 * @param a     Device buffer; read as input, and its first gridDim.x entries
 *              are overwritten with the per-block sums.
 * @param size  Number of valid elements in `a`; threads whose global index is
 *              >= size contribute 0 instead of reading past the end.
 *
 * Launch preconditions (not checked here):
 *   - 1D grid and 1D block.
 *   - blockDim.x <= SHMEM_SIZE (size of the static shared array).
 *   - blockDim.x is a power of two; otherwise the halving loop skips elements.
 *
 * NOTE(review): the result is written into the same buffer other blocks read
 * from. Block b stores to a[b] while block 0 loads a[0 .. blockDim.x-1], and
 * CUDA gives no ordering between blocks, so for 0 < b < blockDim.x the store
 * can land before block 0's load. A separate output buffer would remove the
 * hazard but would change this kernel's signature.
 */
__global__ void reduce(int* a, int size)
{
    __shared__ int partial_sum[SHMEM_SIZE];

    // Calculate the global thread ID
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Load elements into shared memory; out-of-range threads load the
    // additive identity so a partially filled last block still sums correctly
    partial_sum[threadIdx.x] = (tid < size) ? a[tid] : 0;
    __syncthreads();

    // Start at 1/2 block stride and divide by two each iteration
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        // Each thread does work unless it is further than the stride
        if (threadIdx.x < s) {
            partial_sum[threadIdx.x] += partial_sum[threadIdx.x + s];
        }
        // Barrier is outside the branch so every thread in the block reaches it
        __syncthreads();
    }

    // Let thread 0 of this block write its result to main memory.
    // The result is indexed by this block.
    if (threadIdx.x == 0) {
        a[blockIdx.x] = partial_sum[0];
    }
}
/**
 * Host wrapper: copies `a` to the GPU, runs the `reduce` kernel with SIZE
 * threads per block, and copies the per-block sums back.
 *
 * @param a  Host array of N ints. On success its first N / SIZE entries are
 *           overwritten with the per-block sums; the rest is left untouched.
 * @param N  Number of elements in `a`.
 * @return   cudaSuccess, or the first CUDA error encountered.
 *
 * Only the N / SIZE full blocks are reduced on the device. The remaining
 * N % SIZE tail elements are folded into the average on the host.
 *
 * NOTE(review): the average is computed but never returned or stored, because
 * the signature exposes no way to hand it back. Confirm how callers are
 * expected to obtain it.
 */
cudaError_t sum_reduce(int* a, unsigned int N)
{
    // Every local is declared before the first `goto Error`, so no jump
    // bypasses an initialization (which is ill-formed in C++).
    cudaError_t cudaStatus = cudaSuccess;
    int* dev_a = NULL;  // NULL makes the cudaFree in the cleanup path a no-op
    // TB Size
    const int TB_SIZE = SIZE;
    // Grid Size: one block per full SIZE-element chunk (no padding)
    const int GRID_SIZE = (int)(N / SIZE);
    float avg = 0.0f;

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // With fewer than SIZE elements there is no full block to launch; a
    // zero-sized grid would be rejected as an invalid configuration.
    if (GRID_SIZE > 0) {
        // Allocate the single GPU buffer (used as both input and output).
        cudaStatus = cudaMalloc((void**)&dev_a, N * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            goto Error;
        }

        cudaStatus = cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy (host to device) failed!\n");
            goto Error;
        }

        reduce<<<GRID_SIZE, TB_SIZE>>>(dev_a, N);

        // A kernel launch returns nothing; configuration errors surface here.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "reduce launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }

        // Blocking copy: waits for the kernel and reports any execution error.
        cudaStatus = cudaMemcpy(a, dev_a, GRID_SIZE * sizeof(int), cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy (device to host) failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }
    }

    // Accumulate the per-block sums...
    for (int i_l = 0; i_l < GRID_SIZE; i_l++)
    {
        avg = avg + a[i_l];
    }
    // ...plus the tail that did not fill a whole block. These host entries
    // still hold their original values because only a[0 .. GRID_SIZE-1] was
    // overwritten by the copy back.
    for (unsigned int i_t = (unsigned int)GRID_SIZE * SIZE; i_t < N; i_t++)
    {
        avg = avg + a[i_t];
    }
    if (N > 0) {
        avg = avg / (float)N;
    }
    (void)avg;  // result currently discarded, see NOTE(review) above

Error:
    cudaFree(dev_a);
    return cudaStatus;
}
If I add the lines below to the code, I am able to build without any errors. Is this the right way to resolve the issue? If not, can anyone suggest the right method?
// Include-once guard. NOTE(review): this only has an effect if these lines
// live in a header file — confirm where the snippet is placed.
#pragma once
// Dummy prototype that is visible only when __INTELLISENSE__ is defined.
// Presumably that macro is set just by Visual Studio's IntelliSense parser and
// not during the real nvcc build, in which case this merely silences the
// editor's "undefined" squiggle and does not change the compiled program —
// TODO confirm against the Visual Studio documentation.
#ifdef __INTELLISENSE__
void __syncthreads();
#endif