Performance test sharedmemory <-> globalmemory

Hi,

I am trying to do some performance tests. For that, I extended the project “Bandwidthtest” from the NVIDIA CUDA examples. I have attached the results. I expected different results — I thought the bandwidth would increase with more data. :blink:

Now my questions:

  • Is my code a good way to measure bandwidth between shared memory and global memory?

  • Can somebody imagine, why the curve is like that?

Thanks

Lanzelot

Here is my code. It works without any changes.

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <stdlib.h>

//#include <iostream>

// includes, project

#include <cutil.h>

#include <cuda.h>

// defines Startwerte

#define MODE RANGE_MODE

#define START SHMOO_INCREMENT_1KB

#define END (16* (1<<10))

#define INCREMENT SHMOO_INCREMENT_1KB

#define FILENAME "ergebnis.csv"

#define PRINTCSV 1

// defines, project

#define MEMCOPY_ITERATIONS  10

#define DEFAULT_SIZE        ( 32 * ( 1 << 20 ) )    //32 M

#define DEFAULT_INCREMENT   (1 << 22)               //4 M

#define CACHE_CLEAR_SIZE    (1 << 24)               //16 M

//shmoo mode defines

#define SHMOO_MEMSIZE_MAX     (1 << 26)         //64 M

#define SHMOO_MEMSIZE_START   (1 << 10)         //1 KB

#define SHMOO_INCREMENT_1KB   (1 << 10)         //1 KB

#define SHMOO_INCREMENT_2KB   (1 << 11)         //2 KB

#define SHMOO_INCREMENT_10KB  (10 * (1 << 10))  //10KB

#define SHMOO_INCREMENT_100KB (100 * (1 << 10)) //100 KB

#define SHMOO_INCREMENT_1MB   (1 << 20)         //1 MB

#define SHMOO_INCREMENT_2MB   (1 << 21)         //2 MB

#define SHMOO_INCREMENT_4MB   (1 << 22)         //4 MB

#define SHMOO_LIMIT_20KB      (20 * (1 << 10))  //20 KB

#define SHMOO_LIMIT_50KB      (50 * (1 << 10))  //50 KB

#define SHMOO_LIMIT_100KB     (100 * (1 << 10)) //100 KB

#define SHMOO_LIMIT_1MB       (1 << 20)         //1 MB

#define SHMOO_LIMIT_16MB      (1 << 24)         //16 MB

#define SHMOO_LIMIT_32MB      (1 << 25)         //32 MB

//enums, project

enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE, DEVICE_TO_SHARED, SHARED_TO_DEVICE };

enum memoryMode { PINNED, PAGEABLE };

////////////////////////////////////////////////////////////////////////////////

// declaration, forward

float testDeviceToSharedTransfer(unsigned int memSize, memoryMode memMode);

float testSharedToDeviceTransfer(unsigned int memSize, memoryMode memMode);

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

/**
 * Program entry point: runs both bandwidth tests (shared->global and
 * global->shared) for transfer sizes from 1 KB up to (but not including)
 * 16 KB, in 1 KB steps, printing one MB/s value per line.
 */
int
main( int argc, char** argv)
{
	//test shared to global memory, 1 KB to 16 KB
	for(int i = SHMOO_INCREMENT_1KB; i < 16 * SHMOO_INCREMENT_1KB; i += SHMOO_INCREMENT_1KB){
		printf("%f\n", testSharedToDeviceTransfer(i, PAGEABLE));
	}

	printf("\n");

	//test global to shared memory, 1 KB to 16 KB
	// BUG FIX: the original called testSharedToDeviceTransfer() here as well,
	// so the global->shared direction was never actually measured.
	for(int i = SHMOO_INCREMENT_1KB; i < 16 * SHMOO_INCREMENT_1KB; i += SHMOO_INCREMENT_1KB){
		printf("%f\n", testDeviceToSharedTransfer(i, PAGEABLE));
	}

	return 0;
}

/**

	Kernel für den device to shared Transfer

*/

/**
 * Kernel for the device (global) -> shared memory transfer test.
 *
 * Each thread copies one byte from global memory into dynamically allocated
 * shared memory. Launch with the dynamic shared-memory size (3rd launch
 * parameter) set to at least blockDim.x bytes.
 */
__global__ void kernelDeviceToSharedTransfer(unsigned char* globalArray){

	//dynamically allocated shared memory
	extern __shared__ unsigned char sharedmemory[];
	unsigned char* shared = sharedmemory;

	// BUG FIX: shared memory is per-block, so it must be indexed with
	// threadIdx only. The original indexed it with
	// blockIdx.x*blockDim.x+threadIdx.x, which forces the dynamic shared
	// allocation to be gridDim.x times larger than actually needed.
	shared[threadIdx.x] = globalArray[blockIdx.x * blockDim.x + threadIdx.x];

	// Barrier after the store.
	// NOTE(review): the shared value is never read back, so the compiler may
	// still eliminate the store being measured — confirm with the generated
	// PTX before trusting the timing.
	__syncthreads();
}

/**

	Kernel für den shared to device Transfer

*/

/**
 * Kernel for the shared -> device (global) memory transfer test.
 *
 * Each thread first fills one byte of shared memory, synchronizes, and then
 * copies it out to global memory. Launch with the dynamic shared-memory size
 * (3rd launch parameter) set to at least blockDim.x bytes.
 */
__global__ void kernelSharedToDeviceTransfer( unsigned char* globalArrayKopie){

	//dynamically allocated shared memory
	extern __shared__ unsigned char sharedmemory[];
	unsigned char* shared = sharedmemory;

	// BUG FIX: the original read shared memory that was never written, so the
	// kernel copied uninitialized data. Fill it first, then barrier before
	// the shared->global copy that is being measured.
	shared[threadIdx.x] = (unsigned char)(threadIdx.x & 0xff);
	__syncthreads();

	// BUG FIX: shared memory is per-block — index with threadIdx only (the
	// original used blockIdx.x*blockDim.x+threadIdx.x, wasting shared space).
	globalArrayKopie[blockIdx.x * blockDim.x + threadIdx.x] = shared[threadIdx.x];
}

///////////////////////////////////////////////////////////////////////////////

//  test the bandwidth of a device to shared memcopy of a specific size

///////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////
//  Measure the bandwidth of a device(global)->shared copy of a given size.
//
//  memSize  number of bytes to transfer per kernel launch
//  memMode  PINNED uses page-locked host buffers, PAGEABLE uses malloc()
//  returns  measured bandwidth in MB/s
///////////////////////////////////////////////////////////////////////////////
float
testDeviceToSharedTransfer(unsigned int memSize, memoryMode memMode)
{
    CUT_DEVICE_INIT();

    unsigned int timer = 0;
    float elapsedTimeInMs = 0.0f;
    float bandwidthInMBs = 0.0f;
    unsigned char *h_idata = NULL;
    unsigned char *h_idataKopie = NULL;
    unsigned char *hostArray = NULL;

    CUT_SAFE_CALL( cutCreateTimer( &timer ) );

    // Allocate host memory.
    // BUG FIX: the original always used malloc() but freed with
    // cudaFreeHost() in PINNED mode; allocation and free must match.
    if( PINNED == memMode )
    {
        CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_idata, memSize ) );
        CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_idataKopie, memSize ) );
        CUDA_SAFE_CALL( cudaMallocHost( (void**)&hostArray, memSize ) );
    }
    else
    {
        h_idata      = (unsigned char *)malloc( memSize );
        h_idataKopie = (unsigned char *)malloc( memSize );
        hostArray    = (unsigned char *)malloc( memSize );
    }

    //initialize the host buffers
    for(unsigned int i = 0; i < memSize; i++)
    {
        h_idata[i]      = (unsigned char) (i & 0xff);
        h_idataKopie[i] = (unsigned char) (0x01);
    }

    // allocate and initialize device memory
    unsigned char* d_idata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, memSize));
    unsigned char* d_idataKopie;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idataKopie, memSize));
    CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, memSize,
                                cudaMemcpyHostToDevice) );
    CUDA_SAFE_CALL( cudaMemcpy( d_idataKopie, h_idataKopie, memSize,
                                cudaMemcpyHostToDevice) );

    // Time MEMCOPY_ITERATIONS launches of the global->shared copy kernel.
    // BUG FIX 1: kernel launches are asynchronous — without a
    // cudaThreadSynchronize() before stopping the timer, only the launch
    // overhead was measured, not the copy itself.
    // BUG FIX 2: the bandwidth formula multiplies by MEMCOPY_ITERATIONS, but
    // the original launched the kernel only once; the loop makes them agree.
    // NOTE(review): memSize bytes of dynamic shared memory per block and
    // memSize/16 threads per block exceed per-block hardware limits for the
    // larger sizes — confirm the launch succeeds on the target device.
    CUT_SAFE_CALL( cutStartTimer( timer));
    for(unsigned int it = 0; it < MEMCOPY_ITERATIONS; it++)
    {
        kernelDeviceToSharedTransfer<<<16, memSize/16, (memSize)>>>(d_idata);
    }
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUT_SAFE_CALL( cutStopTimer( timer));
    elapsedTimeInMs = cutGetTimerValue( timer);

    //copy the comparison buffer from device to host
    CUDA_SAFE_CALL( cudaMemcpy(hostArray, d_idataKopie, memSize, cudaMemcpyDeviceToHost));

    //calculate bandwidth in MB/s: total bytes moved / elapsed seconds
    bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) /
                                        (elapsedTimeInMs * (float)(1 << 20));

    // Clean up.
    // BUG FIX: the original leaked hostArray, h_idataKopie (pageable branch)
    // and d_idataKopie.
    CUT_SAFE_CALL( cutDeleteTimer( timer));
    if( PINNED == memMode )
    {
        CUDA_SAFE_CALL( cudaFreeHost(h_idata) );
        CUDA_SAFE_CALL( cudaFreeHost(h_idataKopie) );
        CUDA_SAFE_CALL( cudaFreeHost(hostArray) );
    }
    else
    {
        free(h_idata);
        free(h_idataKopie);
        free(hostArray);
    }
    CUDA_SAFE_CALL(cudaFree(d_idata));
    CUDA_SAFE_CALL(cudaFree(d_idataKopie));

    return bandwidthInMBs;
}

///////////////////////////////////////////////////////////////////////////////

//  test the bandwidth of a shared to device memcopy of a specific size

///////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////
//  Measure the bandwidth of a shared->device(global) copy of a given size.
//
//  memSize  number of bytes to transfer per kernel launch
//  memMode  PINNED uses page-locked host buffers, PAGEABLE uses malloc()
//  returns  measured bandwidth in MB/s
///////////////////////////////////////////////////////////////////////////////
float
testSharedToDeviceTransfer(unsigned int memSize, memoryMode memMode)
{
    CUT_DEVICE_INIT();

    unsigned int timer = 0;
    float elapsedTimeInMs = 0.0f;
    float bandwidthInMBs = 0.0f;
    unsigned char *h_idataKopieDtoStoD = NULL;
    unsigned char *hostArray = NULL;

    CUT_SAFE_CALL( cutCreateTimer( &timer ) );

    // Allocate host memory.
    // BUG FIX: the original always used malloc() but freed with
    // cudaFreeHost() in PINNED mode; allocation and free must match.
    if( PINNED == memMode )
    {
        CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_idataKopieDtoStoD, memSize ) );
        CUDA_SAFE_CALL( cudaMallocHost( (void**)&hostArray, memSize ) );
    }
    else
    {
        h_idataKopieDtoStoD = (unsigned char *)malloc( memSize );
        hostArray           = (unsigned char *)malloc( memSize );
    }

    //initialize the host buffer
    for(unsigned int i = 0; i < memSize; i++)
    {
        h_idataKopieDtoStoD[i] = (unsigned char) (0x01);
    }

    // Allocate and initialize device memory.
    // NOTE: the original also allocated an unused d_idata buffer here; it
    // was never read or written, so it has been removed.
    unsigned char* d_idataKopieDtoStoD;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idataKopieDtoStoD, memSize));
    CUDA_SAFE_CALL( cudaMemcpy( d_idataKopieDtoStoD, h_idataKopieDtoStoD, memSize,
                            cudaMemcpyHostToDevice) );

    // Time MEMCOPY_ITERATIONS launches of the shared->global copy kernel.
    // BUG FIX 1: the original never called cutStopTimer(), reading the timer
    // while it was still running.
    // BUG FIX 2: the bandwidth formula multiplies by MEMCOPY_ITERATIONS, but
    // the original launched the kernel only once; the loop makes them agree.
    // NOTE(review): memSize bytes of dynamic shared memory per block and
    // memSize/16 threads per block exceed per-block hardware limits for the
    // larger sizes — confirm the launch succeeds on the target device.
    CUT_SAFE_CALL( cutStartTimer( timer));
    for(unsigned int it = 0; it < MEMCOPY_ITERATIONS; it++)
    {
        kernelSharedToDeviceTransfer<<<16, memSize/16, (memSize)>>>(d_idataKopieDtoStoD);
    }
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUT_SAFE_CALL( cutStopTimer( timer));
    elapsedTimeInMs = cutGetTimerValue( timer);

    //copy the result from device to host
    CUDA_SAFE_CALL( cudaMemcpy(hostArray, d_idataKopieDtoStoD, memSize, cudaMemcpyDeviceToHost));

    //calculate bandwidth in MB/s: total bytes moved / elapsed seconds
    bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) /
                                        (elapsedTimeInMs * (float)(1 << 20));

    // Clean up.
    // BUG FIX: the original leaked hostArray and d_idataKopieDtoStoD.
    CUT_SAFE_CALL( cutDeleteTimer( timer));
    if( PINNED == memMode )
    {
        CUDA_SAFE_CALL( cudaFreeHost(h_idataKopieDtoStoD) );
        CUDA_SAFE_CALL( cudaFreeHost(hostArray) );
    }
    else
    {
        free(h_idataKopieDtoStoD);
        free(hostArray);
    }
    CUDA_SAFE_CALL(cudaFree(d_idataKopieDtoStoD));

    return bandwidthInMBs;
}

forum.JPG

A couple of things should be changed. I think the bizarre graph has to do with occupancy. First of all, you are only instantiating 16 thread blocks. Now, this may seem like a perfect match if your GPU has 16 multiprocessors. But! It looks like the kernels don’t know how much shared memory they’re going to use at compile time, which makes it difficult for the thread blocks to be scheduled efficiently. The important point here is that you can not assume that each block will be scheduled to a different multiprocessor. Indeed, from the graph, I would think that up until 8KB transfers, two blocks are running on 8 multiprocessors, while the other 8 multiprocessors are idle. Then when you go over 8KB of shared memory per block, you start taking a big performance hit because only one block can run on a multiprocessor at a time.

To measure the maximum global memory to shared memory, here’s what you should do:

  1. Make sure you are calling kernels in such a configuration as to ensure maximum occupancy. For example, if each block has 256 threads and takes 4Kb of shared memory, you will get maximum occupancy (with your kernels, at least). 3 threadblocks will run simultaneously on each multiprocessor, assuming you instantiate enough thread blocks.

  2. Make sure you instantiate enough thread blocks to fill the whole device. To make the device run at maximum efficiency, try 100+ thread blocks

  3. In the kernel, try making the shared arrays with a known number of elements at compile time. This would mean that you would have to re-run the program for each size, which sucks…but this way, the blocks will be able to be scheduled most efficiently.

  4. Your kernels are very short. Much of the measured run time will be in the overhead of calling the kernel, so your measured results will be far lower than the actual bandwidth. To fix this, modify your kernels to do a copy, sync threads, copy, sync threads, copy, sync threads, etc. (You can make a for loop inside the kernel to do this.) This way, the overhead of actually invoking the kernel will become insignificant, and you will obtain the true bandwidth from shared to global memory.

First, establish a baseline. An additional test you might consider running before measuring performance of your kernel is to call cudaMemcpy( void*, void*, size_t, cudaMemcpyDeviceToDevice) on large blocks in global memory. This will presumably test global memory bandwidth and provide a reasonable level of performance to expect from your kernels.

Back to the problem at hand.

I agree with this statement in particular and wish to elaborate upon it.

When writing kernels, you may recall that integer multiplies are rather slow. You happen to be using them to compute the index of every element. I would recommend you compute base addresses with multiplication and then enter a loop that computes new indices by adding an offset at the end of each iteration.

The following is an example of a function template that achieves fairly high memory bandwidth (assuming T is 32-bit aligned). It reads every element in a buffer with potentially nonunit stride and writes them out to a buffer of some other stride. One could easily write to shared memory instead of the register variable ‘element’ should one feel so inclined. This achieves ~60 GB/s on my GeForce 8800 GTX.

/**
 * Memory-bandwidth test kernel.
 *
 * Reads every element of input buffer 'a' (elements spaced A_stride apart)
 * and writes it to output buffer 'r' (elements spaced R_stride apart).
 * Threads are laid out along the y dimension; each thread starts at its
 * global y index and advances by the total thread count per iteration
 * (visible in the gridDim.y * blockDim.y step below).
 */
template <typename T>
void __global__ cuda_membandwidth_test(
	const int A_length, const int A_stride, const T *a,
	const int R_length, const int R_stride, T *r) {
	// This thread's global position along y, and the total thread count.
	const int globalThread = (int)(blockDim.y * blockIdx.y) + threadIdx.y;
	const int totalThreads = gridDim.y * blockDim.y;
	// One past the last valid input offset.
	const int inputEnd = A_length * A_stride;
	// Per-iteration advance for the read and write cursors.
	const int stepA = totalThreads * A_stride;
	const int stepR = totalThreads * R_stride;

	int readAt  = globalThread * A_stride;
	int writeAt = globalThread * R_stride;
	while (readAt < inputEnd) {
		T element = a[readAt];
		// .. processing ..
		r[writeAt] = element;
		readAt  += stepA;
		writeAt += stepR;
	}
}

..

dim3 grid(1, 64), block(1, 256);

cuda_membandwidth_test<float><<< grid, block >>>(

  input_length, input_stride, input_vector,

  output_length, output_stride, output_vector);

  

Hope that helps.