Performance doubts

Hi all! After doing a few programs in CUDA i started now obtaining their effective bandwidth. However i have some strange results, for example in the following code, where i can sum all the elements in a vector(regardless of dimension), the bandwidth with the Unroll Code and the “normal” code seems to have the same median result(around 3000 Gb/s)

I dont know if im doing something wrong(AFAIK the program works fine) but from what ive read so far, the Unroll code should have a higher bandwidth.

Thanks in advance

#include <stdio.h>

#include <limits.h>

#include <stdlib.h>

#include <math.h>

#define elements 1000

#define blocksize 16	

__global__ void vecsumkernel(float*input, float*output,int nelements){

 	

	__shared__ float psum[blocksize];

	int tid=threadIdx.x;

	if(tid + blockDim.x * blockIdx.x < nelements)

	psum[tid]=input[tid+blockDim.x*blockIdx.x];

	else

	psum[tid]=0.0f;

	__syncthreads();

	//WITHOUT UNROLL

	

	int stride; 	

	for(stride=blockDim.x/2;stride>0;stride>>=1){

		if(tid<stride)

			psum[tid]+=psum[tid+stride];

	__syncthreads();

	}

	if(tid==0)

		output[blockIdx.x]=psum[0];

	

	//WITH UNROLL

/*

	if(blocksize>=512 && tid<256) psum[tid]+=psum[tid+256];__syncthreads();

	if(blocksize>=256 && tid<128) psum[tid]+=psum[tid+128];__syncthreads();

	if(blocksize>=128 && tid<64) psum[tid]+=psum[tid+64];__syncthreads();

	

	if (tid < 32) {

   		if (blocksize >= 64) psum[tid] += psum[tid + 32];

	    	if (blocksize >= 32) psum[tid] += psum[tid + 16];

	    	if (blocksize >= 16) psum[tid] += psum[tid + 8];

	    	if (blocksize >=  8) psum[tid] += psum[tid + 4];

    		if (blocksize >=  4) psum[tid] += psum[tid + 2];

	    	if (blocksize >=  2) psum[tid] += psum[tid + 1];

	}*/

	if(tid==0)

		output[blockIdx.x]=psum[0];

	

}

void vecsumv2(float*input, float*output, int nelements){

	dim3 dimBlock(blocksize,1,1);

	int i;

	for(i=((int)ceil((double)(nelements)/(double)blocksize))*blocksize;i>1;i=(int)ceil((double)i/(double)blocksize)){

		dim3 dimGrid((int)ceil((double)i/(double)blocksize),1,1);

		printf("\ni=%d\ndimgrid=%u\n ",i,dimGrid.x);

		vecsumkernel<<<dimGrid,dimBlock>>>(i==((int)ceil((double)(nelements)/(double)blocksize))*blocksize ?input:output,output,i==((int)ceil((double)(nelements)/(double)blocksize))*blocksize ? elements:i);

	}

}

void printVec(float*vec,int dim){

	printf("\n{");

	for(int i=0;i<dim;i++)

		printf("%f ",vec[i]);

	printf("}\n");

}

int main(){

	cudaEvent_t evstart, evstop;

	cudaEventCreate(&evstart);

	cudaEventCreate(&evstop);

	float*input=(float*)malloc(sizeof(float)*(elements));

	for(int i=0;i<elements;i++)

		input[i]=(float) i;

	float*output=(float*)malloc(sizeof(float)*elements);

	float *input_d,*output_d;

	cudaMalloc((void**)&input_d,elements*sizeof(float));

	cudaMalloc((void**)&output_d,elements*sizeof(float));

	cudaMemcpy(input_d,input,elements*sizeof(float),cudaMemcpyHostToDevice);

	

	cudaEventRecord(evstart,0);

	vecsumv2(input_d,output_d,elements);

	cudaEventRecord(evstop,0);

	cudaEventSynchronize(evstop);

	float time;

	cudaEventElapsedTime(&time,evstart,evstop);

	printf("\ntempo gasto:%f\n",time);

	float Bandwidth=((1000*4*2)/10^9)/time;

	printf("\n Bandwidth:%f Gb/s\n",Bandwidth);

	cudaMemcpy(output,output_d,elements*sizeof(float),cudaMemcpyDeviceToHost);

	cudaFree(input_d);

	cudaFree(output_d);

	printf("soma do vector");

	printVec(output,4);

}