sum reduction cuda,reduction

I wrote a sum reduction program with device function because I want to know if it works well in this way. these are the codes

#include <stdlib.h>

#include <cutil_inline.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <math.h>

__device__ void product(int *pixel,int *poids,int *product)

{

  int i=blockIdx.x*blockDim.x+threadIdx.x;

  int n=16;

if(i<n)

  {

	product[i]=pixel[i]*poids[i];

  }

}

__device__ void sum1(int *product,int n,int *sum1)

{

  __shared__ int s_data[blockDim.x*sizeof(int)];

  int i=blockIdx.x*blockDim.x+threadIdx.x;

  int tid=threadIdx.x;

s_data[tid]=(i<n) ? (product[i]):0;

  __syncthreads();

for(int dist=blockDim.x/2;dist>0;dist/=2)

  {

	if(tid<dist)

	{

	  s_data[tid]+=s_data[tid+dist];

	}

	__syncthreads();

}

if(tid==0)

  {

	sum1[blockIdx.x]=s_data[0];

  }

}

__device__ int sum2(int *sum1,int num_blocks)

{

  __shared__ int s_data[num_blocks*sizeof(float)];

  int i=threadIdx.x;

  int tid=threadIdx.x;

  int *sum2=s_data;

// reading from global memory, writing to shared memory

  s_data[tid]=(i<num_blocks) ? (sum1[i]):0;

  __syncthreads();

// do reduction in shared memory

  for(int dist=num_blocks/2;dist>0;dist/=2)

  {

	if(tid<dist)

	{

	  s_data[tid]+=s_data[tid+dist];

	}

	__syncthreads();

}

if(tid==0)   *sum2=s_data[0];

  return *sum2;

}

__global__ void sum(int *pixel,int *poids,int *product,int *sum,int n)

{

  int sum1[n*sizeof(int)];

  int num_block=n/4+((n%4==0)?0:1);

 product(pixel, poids, product);

  sum1(product, n, sum1);

*sum=sum2(sum1,num_block);

}				 

#include <stdio.h>

#include <stdlib.h>

#include <cutil_inline.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <threadtest_kernel.cu>

int main()

{

int n=16;

  int i;

  int h_pixel[]={23,21,34,2,2,3,4,45,55,69,5,3,5,8,7,9};

  int h_poids[]={20,23,56,8,7,4,9,77,23,4,6,4,2,3,1,4};

  int *sum;

  int *h_sum;

int *product;

  int *h_product;

int num_blocks=n/4+((n%4==0)? 0:1);

  int block_size=4;

cudaMalloc((void**) &pixel,sizeof(int)*n);

  cudaMalloc((void**) &poids,sizeof(int)*n);

  cudaMalloc((void**) &sum,sizeof(int)*num_blocks);

  cudaMalloc((void**) &product,sizeof(int)*n);

h_sum=(int *)malloc(sizeof(int));

  h_product=(int *)malloc(sizeof(int)*n);

cudaMemcpy(pixel,h_pixel,sizeof(int)*n,cudaMemcpyHostToDevice);

  cudaMemcpy(poids,h_poids,sizeof(int)*n,cudaMemcpyHostToDevice);

  sum<<<num_blocks,block_size>>>(pixel,poids,product,sum,n);

cudaMemcpy(h_product,product,sizeof(int)*n,cudaMemcpyDeviceToHost);

  cudaMemcpy(h_sum,sum,sizeof(int),cudaMemcpyDeviceToHost);

for(i=0;i<n;i++)  printf("product=%d\n",h_product[i]);

  printf("sum=%d\n",h_sum[0]);

cudaFree(pixel);

  cudaFree(poids);

  cudaFree(product);

  cudaFree(sum);

  free(h_product);

  free(h_sum);

But when I compile it, it appears 3 errors on line in red. It says “error: expression must have (pointer-to-) function type”. I have no idea about it.

Thanks for your help!!!

You’ve declared sum as both an int* in main() and as a function/kernel. You can’t do that.