Hi,
The following code performs a scan (prefix sum). Can anybody tell me what further modifications could be made to the host code in main(), leaving aside any improvements to the kernel?
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
// includes, project
//#include <cutil_inline.h>
#include<cuda.h>
#include<cuda_runtime.h>
#ifndef _SCAN_BEST_KERNEL_H_
#define _SCAN_BEST_KERNEL_H_
// Number of shared-memory banks assumed by the padding scheme, and its log2.
// NOTE(review): 16 banks matches older GPUs; newer devices commonly have 32
// (LOG_NUM_BANKS 5) - confirm against the target architecture before tuning.
#define NUM_BANKS 16
#define LOG_NUM_BANKS 4
// Define this to more rigorously avoid bank conflicts, even at the lower (root) levels of the tree
//#define ZERO_BANK_CONFLICTS
// CONFLICT_FREE_OFFSET(index): padding added to a logical shared-memory index
// so that elements a multiple of NUM_BANKS apart do not land in the same bank.
// Every shift is parenthesised explicitly: '+' binds tighter than '>>', so the
// unparenthesised form "a >> b + a >> c" would parse as "(a >> (b + a)) >> c".
#ifdef ZERO_BANK_CONFLICTS
#define CONFLICT_FREE_OFFSET(index) (((index) >> LOG_NUM_BANKS) + ((index) >> (2 * LOG_NUM_BANKS)))
#else
#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)
#endif
// TEMP(index): access to the shared array named `temp` in the enclosing scope.
// With CHECK_BANK_CONFLICTS defined the access is routed through
// cutilBankChecker, which must then be provided by the (currently commented
// out) cutil header.
#ifdef CHECK_BANK_CONFLICTS
#define TEMP(index) cutilBankChecker(temp, index)
#else
#define TEMP(index) temp[index]
#endif
/**
 * Exclusive prefix sum (scan) of n floats, computed by one thread block with
 * an up-sweep / down-sweep tree held in shared memory.
 *
 * After the kernel, g_odata[0] == 0 and g_odata[i] is the sum of
 * g_idata[0 .. i-1].
 *
 * Launch contract:
 *   - Only threadIdx.x is used for indexing, so launch a single 1-D block.
 *   - blockDim.x must be at least n/2; threads beyond n/2 only take part in
 *     the barriers and never touch memory.
 *   - n must be a power of two (the tree halves/doubles strides exactly);
 *     n == 1 is handled as a special case and n <= 0 is a no-op.
 *   - Dynamic shared memory must hold at least
 *     n + CONFLICT_FREE_OFFSET(n - 1) floats.
 *
 * @param g_odata  device pointer to the n output elements
 * @param g_idata  device pointer to the n input elements
 * @param n        number of elements to scan
 */
__global__ void scan_best(float *g_odata, float *g_idata, int n)
{
    // Dynamically allocated shared memory for scan kernels
    extern __shared__ float temp[];

    int thid = threadIdx.x;

    // n is identical for every thread, so this early exit is uniform across
    // the block and cannot leave some threads waiting at a barrier.
    if (n < 2)
    {
        // The exclusive scan of a single element is just the identity, 0.
        if (n == 1 && thid == 0)
        {
            g_odata[0] = 0.0f;
        }
        return;
    }

    int half = n / 2;

    // Each participating thread owns two elements: one in each half.
    int ai = thid;
    int bi = thid + half;

    // compute spacing to avoid bank conflicts
    int bankOffsetA = CONFLICT_FREE_OFFSET(ai);
    int bankOffsetB = CONFLICT_FREE_OFFSET(bi);

    // Threads past n/2 have no elements; letting them load/store would read
    // and write beyond the n-element global buffers.
    bool ownsElements = (thid < half);

    // Cache the computational window in shared memory
    if (ownsElements)
    {
        TEMP(ai + bankOffsetA) = g_idata[ai];
        TEMP(bi + bankOffsetB) = g_idata[bi];
    }

    int offset = 1;

    // build the sum in place up the tree
    for (int d = half; d > 0; d >>= 1)
    {
        __syncthreads();
        if (thid < d)
        {
            int left  = offset * (2 * thid + 1) - 1;
            int right = offset * (2 * thid + 2) - 1;
            left  += CONFLICT_FREE_OFFSET(left);
            right += CONFLICT_FREE_OFFSET(right);
            TEMP(right) += TEMP(left);
        }
        offset *= 2;
    }

    // scan back down the tree
    // clear the last element (thread 0 also wrote it in the final up-sweep
    // step, so no extra barrier is needed before this store)
    if (thid == 0)
    {
        int index = n - 1;
        index += CONFLICT_FREE_OFFSET(index);
        TEMP(index) = 0.0f;
    }

    // traverse down the tree building the scan in place
    for (int d = 1; d < n; d *= 2)
    {
        offset /= 2;
        __syncthreads();
        if (thid < d)
        {
            int left  = offset * (2 * thid + 1) - 1;
            int right = offset * (2 * thid + 2) - 1;
            left  += CONFLICT_FREE_OFFSET(left);
            right += CONFLICT_FREE_OFFSET(right);
            // Swap-and-add: the left child receives the parent's value, the
            // right child receives parent + old left.
            float t = TEMP(left);
            TEMP(left) = TEMP(right);
            TEMP(right) += t;
        }
    }
    __syncthreads();

    // write results to global memory
    if (ownsElements)
    {
        g_odata[ai] = TEMP(ai + bankOffsetA);
        g_odata[bi] = TEMP(bi + bankOffsetB);
    }
}
#endif // #ifndef _SCAN_BEST_KERNEL_H_
//WHAT IMPROVEMENT CAN BE DONE HERE ONWARD?
// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host driver: reads up to n floats from stdin, runs scan_best on the device,
 * prints the kernel time and the scanned values.
 *
 * Naming: g_* pointers live in device global memory, m_* in host main memory.
 * Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
 */
int main()
{
    // Number of data elements. scan_best expects a power of two.
    const int n = 8;
    const size_t size = sizeof(float) * n;

    // Kernel configuration: one block, one thread per pair of elements, and
    // enough dynamic shared memory for the n elements plus bank padding.
    const int threadsPerBlock = n / 2;
    const size_t sharedBytes = (size_t)(n + CONFLICT_FREE_OFFSET(n - 1)) * sizeof(float);

    float *g_odata = NULL, *g_idata = NULL;

    // Allocate buffers in main memory.
    float *m_idata = (float*)malloc(size);
    float *m_odata = (float*)malloc(size);
    if (m_idata == NULL || m_odata == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)size);
        free(m_idata);
        free(m_odata);
        return EXIT_FAILURE;
    }

    // Read the input. Missing values are treated as 0 so the buffer copied to
    // the device never contains uninitialized data.
    int provided = 0;
    while (provided < n && scanf("%f", &m_idata[provided]) == 1)
    {
        provided++;
    }
    if (provided < n)
    {
        fprintf(stderr, "Warning: read %d of %d values; padding the rest with 0\n", provided, n);
        for (int i = provided; i < n; i++)
        {
            m_idata[i] = 0.0f;
        }
    }

    // Allocate buffers in device memory and upload the input.
    checkCuda(cudaMalloc((void**)&g_idata, size), "cudaMalloc(g_idata)");
    checkCuda(cudaMalloc((void**)&g_odata, size), "cudaMalloc(g_odata)");
    checkCuda(cudaMemcpy(g_idata, m_idata, size, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Time only the kernel. Waiting on the stop event is sufficient to know
    // the kernel has finished, so no separate device-wide sync is needed.
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    scan_best<<< 1, threadsPerBlock, sharedBytes >>>(g_odata, g_idata, n);
    checkCuda(cudaGetLastError(), "scan_best launch");
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "scan_best execution");

    float elapsedTime = 0.0f;
    checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    printf("Processing time=%f(ms)\n", elapsedTime);

    // Copy the result from device global memory back into main memory.
    checkCuda(cudaMemcpy(m_odata, g_odata, size, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");

    // print m_odata
    for (int i = 0; i < n; i++)
    {
        printf("\n%f", m_odata[i]);
    }

    checkCuda(cudaFree(g_idata), "cudaFree(g_idata)");
    checkCuda(cudaFree(g_odata), "cudaFree(g_odata)");
    free(m_idata);
    free(m_odata);
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    return 0;
}