 # Measuring Effective Bandwidth

I use a GeForce 8800 GTX. For it, I believe the theoretical "device-to-device" bandwidth is 86.4 GB/s. Now I want to know how to precisely measure the time to use in the effective-bandwidth calculation. I assume this relates to device-to-device transfers, so to calculate the time of my transfers, should I just take the difference between calling the kernel with all my instructions and calling an empty kernel (one with no instructions)?
Also, for calculating my total number of bytes read and written, take a kernel like the following (which downsamples an x-by-y image to x/2-by-y/2):

// Downsamples an x-by-y RGB image (stored as three separate int planes
// r_d/g_d/b_d) to (x/2)-by-(y/2) by averaging each 2x2 pixel block into one
// output pixel in f_r/f_g/f_b.
//
// Launch layout: a 2D grid of 2D blocks; each thread produces one output
// pixel. 'id' is the flat output-pixel index; threads with
// id >= height*width/4 do nothing (bounds guard for uneven grids).
//
// NOTE: the original post had "global void" — the forum stripped the
// underscores from the __global__ qualifier, so it is restored here.
__global__ void streamkernel(int *r_d, int *g_d, int *b_d, int height, int width,
                             int *f_r, int *f_g, int *f_b)
{
    // Flat global thread index across the 2D grid of 2D blocks.
    int id = blockIdx.x * blockDim.x * blockDim.y
           + threadIdx.y * blockDim.x + threadIdx.x
           + blockIdx.y * gridDim.x * blockDim.x * blockDim.y;

    // Top-left source pixel of this thread's 2x2 input block:
    // output column id % (width/2) maps to source column 2*(id % (width/2)),
    // output row id / (width/2) maps to source row 2*(id / (width/2)).
    int number = 2 * (id % (width / 2)) + (id / (width / 2)) * width * 2;

    if (id < height * width / 4) {
        // Average the 2x2 neighbourhood per channel (integer division by 4).
        f_r[id] = (r_d[number] + r_d[number + 1] + r_d[number + width] + r_d[number + width + 1]) / 4;
        f_g[id] = (g_d[number] + g_d[number + 1] + g_d[number + width] + g_d[number + width + 1]) / 4;
        f_b[id] = (b_d[number] + b_d[number + 1] + b_d[number + width] + b_d[number + width + 1]) / 4;
    }
}
Using this, should the totals be: reads = x * y * 3 (for r, g, b) * 4 (bytes per int), and writes = (1/4) * x * y * 3 (for r, g, b) * 4 (bytes per int)?

11.5 ms on an 8800 GTS, so 268435456 bytes * 2 (read + write) / 11.5 ms ≈ 46 GB/s.

In fact, when I read only 2 elements per thread I get 50 GB/s.

``````#include <stdio.h>

#include <cuda.h>

#include <time.h>

#include <math.h>

#include "cutil_inline.h"

// Bandwidth micro-benchmark kernel: each thread stages 4 floats (spaced 512
// elements apart) through shared memory and writes them straight back to the
// same global locations, i.e. 4 reads + 4 writes of a float per thread.
//
// Expected launch config: 512 threads per block (e.g. dim3(64,8,1)) and a
// grid such as dim3(1024,32,1) — the constants 512 and 524288 (= 1024*512)
// hard-code that layout. Parameters 'in' and 'ss' are unused but kept so the
// caller's launch line is unchanged.
__global__ void square_array(float *a, int N, int in, int ss)
{
    // One 4-float slot set per thread: 512 threads * 4 floats = 2048 entries.
    // (The original declared a scalar __shared__ float here, which does not
    // compile when indexed as an array.)
    __shared__ float s_data[2048];

    // Linear thread index within the 512-thread block. The original used an
    // undeclared 'sh'; this is the natural reconstruction — verify it matches
    // the author's intent.
    int sh = threadIdx.y * blockDim.x + threadIdx.x;

    int id = 512 * blockIdx.x + 524288 * blockIdx.y + sh;

    if (id < N) {
        // Stage four global elements into this thread's private shared slots.
        s_data[sh]        = a[id];
        s_data[sh + 512]  = a[id + 512];
        s_data[sh + 1024] = a[id + 1024];
        s_data[sh + 1536] = a[id + 1536];

        // Write them back unchanged. Each thread touches only its own four
        // shared slots, so no __syncthreads() is required between the phases.
        a[id]        = s_data[sh];
        a[id + 512]  = s_data[sh + 512];
        a[id + 1024] = s_data[sh + 1024];
        a[id + 1536] = s_data[sh + 1536];
    }
}

// main routine that executes on the host

// Host driver: allocates a 2^26-float (256 MB) buffer, copies it to the
// device, and times one launch of square_array with CUDA events to estimate
// effective bandwidth.
int main(void)
{
    float *memoirecpu1, *memoiregraphique1;  // host & device arrays
    cudaEvent_t start, stop;

    int N = 1 << 26;                    // 64M floats = 256 MB
    size_t size = N * sizeof(float);

    // Allocate host array. Contents are left uninitialized on purpose —
    // only transfer/traffic timing is measured, not the values.
    memoirecpu1 = (float *)malloc(size);
    if (memoirecpu1 == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)size);
        return 1;
    }

    cutilSafeCall( cudaEventCreate(&start) );
    cutilSafeCall( cudaEventCreate(&stop)  );

    float gpu_time = 0.0f;

    //---------------------------

    cutilSafeCall( cudaMalloc((void **) &memoiregraphique1, size) );
    cutilSafeCall( cudaMemcpy(memoiregraphique1, memoirecpu1, size,
                              cudaMemcpyHostToDevice) );

    // Each thread of square_array touches 4 floats, so pass N/4 as the
    // per-thread element count.
    N = N / 4;

    cudaEventRecord(start, 0);
    square_array <<< dim3(1024,32,1), dim3(64,8,1) >>> (memoiregraphique1, N, 1, 1);
    cudaEventRecord(stop, 0);

    // Surface launch-configuration errors immediately.
    cutilSafeCall( cudaGetLastError() );

    // BUG FIX: the stop event must have completed before querying it —
    // without this sync, cudaEventElapsedTime fails with cudaErrorNotReady
    // (kernel launches and event records are asynchronous).
    cutilSafeCall( cudaEventSynchronize(stop) );
    cutilSafeCall( cudaEventElapsedTime(&gpu_time, start, stop) );

    printf("time spent executing by the GPU: %.2f\n", gpu_time);

    //------------------------------

    // Release device and host resources.
    cutilSafeCall( cudaEventDestroy(start) );
    cutilSafeCall( cudaEventDestroy(stop)  );
    cutilSafeCall( cudaFree(memoiregraphique1) );
    free(memoirecpu1);

    return 0;
}
``````