I wrote a sum reduction program with device function because I want to know if it works well in this way. these are the codes
#include <stdlib.h>
#include <cutil_inline.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
__device__ void product(int *pixel,int *poids,int *product)
{
int i=blockIdx.x*blockDim.x+threadIdx.x;
int n=16;
if(i<n)
{
product[i]=pixel[i]*poids[i];
}
}
__device__ void sum1(int *product,int n,int *sum1)
{
__shared__ int s_data[blockDim.x*sizeof(int)];
int i=blockIdx.x*blockDim.x+threadIdx.x;
int tid=threadIdx.x;
s_data[tid]=(i<n) ? (product[i]):0;
__syncthreads();
for(int dist=blockDim.x/2;dist>0;dist/=2)
{
if(tid<dist)
{
s_data[tid]+=s_data[tid+dist];
}
__syncthreads();
}
if(tid==0)
{
sum1[blockIdx.x]=s_data[0];
}
}
__device__ int sum2(int *sum1,int num_blocks)
{
__shared__ int s_data[num_blocks*sizeof(float)];
int i=threadIdx.x;
int tid=threadIdx.x;
int *sum2=s_data;
// reading from global memory, writing to shared memory
s_data[tid]=(i<num_blocks) ? (sum1[i]):0;
__syncthreads();
// do reduction in shared memory
for(int dist=num_blocks/2;dist>0;dist/=2)
{
if(tid<dist)
{
s_data[tid]+=s_data[tid+dist];
}
__syncthreads();
}
if(tid==0) *sum2=s_data[0];
return *sum2;
}
__global__ void sum(int *pixel,int *poids,int *product,int *sum,int n)
{
int sum1[n*sizeof(int)];
int num_block=n/4+((n%4==0)?0:1);
product(pixel, poids, product);
sum1(product, n, sum1);
*sum=sum2(sum1,num_block);
}
#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <threadtest_kernel.cu>
int main()
{
int n=16;
int i;
int h_pixel[]={23,21,34,2,2,3,4,45,55,69,5,3,5,8,7,9};
int h_poids[]={20,23,56,8,7,4,9,77,23,4,6,4,2,3,1,4};
int *sum;
int *h_sum;
int *product;
int *h_product;
int num_blocks=n/4+((n%4==0)? 0:1);
int block_size=4;
cudaMalloc((void**) &pixel,sizeof(int)*n);
cudaMalloc((void**) &poids,sizeof(int)*n);
cudaMalloc((void**) &sum,sizeof(int)*num_blocks);
cudaMalloc((void**) &product,sizeof(int)*n);
h_sum=(int *)malloc(sizeof(int));
h_product=(int *)malloc(sizeof(int)*n);
cudaMemcpy(pixel,h_pixel,sizeof(int)*n,cudaMemcpyHostToDevice);
cudaMemcpy(poids,h_poids,sizeof(int)*n,cudaMemcpyHostToDevice);
sum<<<num_blocks,block_size>>>(pixel,poids,product,sum,n);
cudaMemcpy(h_product,product,sizeof(int)*n,cudaMemcpyDeviceToHost);
cudaMemcpy(h_sum,sum,sizeof(int),cudaMemcpyDeviceToHost);
for(i=0;i<n;i++) printf("product=%d\n",h_product[i]);
printf("sum=%d\n",h_sum[0]);
cudaFree(pixel);
cudaFree(poids);
cudaFree(product);
cudaFree(sum);
free(h_product);
free(h_sum);
But when I compile it, it appears 3 errors on line in red. It says “error: expression must have (pointer-to-) function type”. I have no idea about it.
Thanks for your help!!!