Measuring Effective Bandwidth

I use a GeForce 8800 GTX. For it, I believe the theoretical device-to-device bandwidth is 86.4 GB/s. Now I want to know how to precisely measure the time to use in the effective-bandwidth calculation. I assume this relates to device-to-device transfers, so to get the time for my transfers, should I just take the difference between calling the kernel with all my instructions and calling an empty kernel (one with no instructions)?
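For reference, kernel time is usually measured by bracketing the launch itself with CUDA events rather than by subtracting an empty-kernel launch; a minimal sketch, where grid, block and the argument list are placeholders for your actual configuration:

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    streamkernel<<< grid, block >>>(r_d, g_d, b_d, height, width, f_r, f_g, f_b);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);               // block until the kernel has finished

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);   // elapsed kernel time in milliseconds

    cudaEventDestroy(start);
    cudaEventDestroy(stop);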
Also, how do I calculate the total number of bytes read and written for a kernel like the following (it downsamples an x-by-y image to x/2 by y/2)?

__global__ void streamkernel(int *r_d, int *g_d, int *b_d, int height, int width,
                             int *f_r, int *f_g, int *f_b)
{
    // Global thread index over the 2D grid of 2D blocks
    int id = blockIdx.x * blockDim.x * blockDim.y
           + threadIdx.y * blockDim.x + threadIdx.x
           + blockIdx.y * gridDim.x * blockDim.x * blockDim.y;

    // Top-left corner of the 2x2 source block for this output pixel
    int number = 2 * (id % (width / 2)) + (id / (width / 2)) * width * 2;

    if (id < height * width / 4) {
        // Average each 2x2 block of source pixels into one output pixel
        f_r[id] = (r_d[number] + r_d[number+1] + r_d[number+width] + r_d[number+width+1]) / 4;
        f_g[id] = (g_d[number] + g_d[number+1] + g_d[number+width] + g_d[number+width+1]) / 4;
        f_b[id] = (b_d[number] + b_d[number+1] + b_d[number+width] + b_d[number+width+1]) / 4;
    }
}
Using this kernel, should the totals be: reads = x * y * 3 (for r, g, b) * 4 (bytes per int), and writes = (1/4) * x * y * 3 (for r, g, b) * 4 (bytes per int)?
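As a concrete check of that count (taking x = y = 1024 purely for illustration, and noting that the read total assumes each source pixel is loaded exactly once, which holds here because the 2x2 source blocks do not overlap):

    bytes read    = x * y * 3 * 4          = 1024 * 1024 * 3 * 4 = 12582912
    bytes written = (x/2) * (y/2) * 3 * 4  =  512 *  512 * 3 * 4 =  3145728
    effective bandwidth = (12582912 + 3145728) bytes / kernel time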

11.5 ms on an 8800 GTS, so 268435456 bytes * 2 (read + write) / 11.5 ms ≈ 46 GB/s.

In fact, when I read only 2 elements per thread, I get 50 GB/s.

#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <math.h>
#include "cutil_inline.h"

// Each thread stages 4 floats through shared memory and writes them back, so
// the kernel reads and writes every element it touches exactly once
// (the in and ss parameters are unused).
__global__ void square_array(float *a, int N, int in, int ss)
{
    __shared__ float s_data[2048];

    int sh = threadIdx.x + 64 * threadIdx.y;              // index within the 512-thread block (blockDim = 64 x 8)
    int id = 512 * blockIdx.x + 524288 * blockIdx.y + sh; // global index (524288 = 512 * gridDim.x)

    if (id < N)
    {
        int z = id;

        // Load 4 floats per thread into shared memory
        s_data[sh]        = a[z];
        s_data[sh + 512]  = a[z + 512];
        s_data[sh + 1024] = a[z + 1024];
        s_data[sh + 1536] = a[z + 1536];

        __syncthreads();

        // Write them back to global memory
        a[z]        = s_data[sh];
        a[z + 512]  = s_data[sh + 512];
        a[z + 1024] = s_data[sh + 1024];
        a[z + 1536] = s_data[sh + 1536];
    }
}

// main routine that executes on the host
int main(void)
{
    float *memoirecpu1, *memoiregraphique1;  // pointers to host & device arrays
    cudaEvent_t start, stop;
    float gpu_time = 0.0f;

    int S = 26;
    int N = 1 << S;                          // 2^26 floats = 256 MB
    size_t size = N * sizeof(float);

    memoirecpu1 = (float *)malloc(size);     // allocate array on host

    cutilSafeCall( cudaEventCreate(&start) );
    cutilSafeCall( cudaEventCreate(&stop) );

    unsigned int timer;
    cutilCheckError( cutCreateTimer(&timer) );
    cutilCheckError( cutResetTimer(timer) );
    cutilSafeCall( cudaThreadSynchronize() );

    //---------------------------
    cudaMalloc((void **) &memoiregraphique1, size);  // allocate array on device
    cudaMemcpy(memoiregraphique1, memoirecpu1, size, cudaMemcpyHostToDevice);

    cutilCheckError( cutStartTimer(timer) );

    N = N / 4;                               // each thread moves 4 floats

    cudaEventRecord(start, 0);
    square_array<<< dim3(1024,32,1), dim3(64,8,1) >>>(memoiregraphique1, N, 1, 1);
    cudaEventRecord(stop, 0);

    // Spin on the host until the kernel has finished
    unsigned long int counter = 0;
    while (cudaEventQuery(stop) == cudaErrorNotReady)
    {
        counter++;
    }

    cutilSafeCall( cudaEventElapsedTime(&gpu_time, start, stop) );
    printf("time spent executing by the GPU: %.2f ms\n", gpu_time);
    //------------------------------

    cudaFree(memoiregraphique1);
    free(memoirecpu1);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}
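To turn the measured time into an effective bandwidth, a couple of lines could follow the printf; a minimal sketch, assuming (as the kernel above does) that every element of the array is read once and written once:

    // 2 * size bytes are moved (one read + one write per float) in gpu_time ms
    double bytes = 2.0 * (double)size;
    double gbps  = bytes / (gpu_time / 1000.0) / 1e9;
    printf("effective bandwidth: %.2f GB/s\n", gbps);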