Welcome everyone External Image to my second thread on memory bandwidth! (first one on cmem here)
smem := shared memory
cmem := constant memory
tmem := texture memory
gmem := global memory
MP := multiprocessor
(i’m using 9800 GT)
Theoretical Discussion:
The Programming Manual will tell us how to calculate global memory bandwidth, but what about ON-CHIP bandwidths? (i.e. smem, cmem cache, tmem cache, registers)
Global Bandwidth (OFF-CHIP) = (memory clock rate) * (data bus size) GB/s
ON-CHIP Bandwidth = ???
Now if we’re looking inside an MP, I’m guessing the 8 ALUs don’t have anything to do with mem transfer… I’ve got no idea how the data bus is configured… Does it still operate in half warps like a gmem transfer? how long does a half warp take to complete a data transfer, 1 clock cycle?
For a given MP, if we assume 1 clock cycle to transfer a half warp (16 threads), it would then take 16 integers/floats per clock cycle or 64 bytes / clk cycle
furthermore, suppose we have a clock rate of 1400 MHz then 1 clock cycle = 1/clock_rate seconds or 89.6 GB/s (but now i think i’m confusing shader clock w/ memory clock)
Measured Results:
after transferring ~5KB of data from smem to smem (see code below) I got 4.52 GB/s
or 5120 bytes / 1836 clk cycles = 2.8 bytes / clk cycle (not even an integer(4bytes) per clk cycle!)
(again clk cycle is referenced to the shader clock)
For comparison, I found that when transferring ~5kb of data from gmem to smem i got 1.36 GB/s (only ONE MP running 1 block of 128 threads)
or 0.88 bytes / clk cycle (code can be found in my first thread)
IS IT TRUE THAT YOU ONLY EXPERIENCE A SPEED UP OF 3x TIMES??? I thought ON-CHIP memory would be significantly faster… :blink:
thanks for the read! External Image
(below you can find smem to smem output & code)
Output:
[codebox]Your GPU clock rate = 1.620000 GHz
Smem test used 10000 test iterations
Total time of 11.333570 ms (18360384 clk cycles)
Average time of 1.133333 us (1836 clk cycles)
Transfered data = 5120 bytes
—> Bandwidth = 4.517647 GB/s <—
[/codebox]
Code:
[codebox]#include <stdio.h>
#include <cuda.h>
extern shared char smem;
global void mem3_kernel( int *data, int *output, int num_elements, int test_iterations )
{
int start, stop, i, s;
int tid = threadIdx.x;
int* smem1 = (int*)smem;
int* smem2 = (int*)&smem1[num_elements];
// INITIAL transfer gmem to smem
for ( s = 0 ; s < num_elements ; s += blockDim.x )
smem1[tid + s] = data[tid + s] ;
__syncthreads();
if ( tid == 0 )
start = clock();
for ( i = 0 ; i < test_iterations ; i ++ )
{
// transfer smem to smem
for ( s = 0 ; s < num_elements ; s += blockDim.x )
smem2[tid + s] = smem1[tid + s];
__syncthreads();
}
__syncthreads();
if (tid == 0 )
stop = clock();
// transfer smem to gmem
for ( s = 0 ; s < num_elements ; s += blockDim.x )
output[tid + s] = smem2[tid + s];
__syncthreads();
if ( tid == 0 )
output[0] = stop - start;
}
int main ()
{
int num_elements, data_size;
int grid_size, block_size;
int test_iterations, i;
int total_clk_cycles, ave_clk_cycles;
int *h_data, *d_data, *h_output, *d_output;
float bandwidth, gpu_clk_rate;
float total_time, ave_time;
// get GPU clk rate
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
gpu_clk_rate = deviceProp.clockRate * 1e-6;
printf("\nYour GPU clock rate = %f GHz\n", gpu_clk_rate);
grid_size = 1;
block_size = 128;
test_iterations = 10000;
num_elements = block_size*10; // 1280 ints (5.12kb)
data_size = num_elements*sizeof(int);
// allocate arrays
h_data = (int*)malloc(data_size);
h_output = (int*)malloc(data_size);
// allocate CUDA arrays
cudaMalloc((void **) &d_data, data_size);
cudaMalloc((void **) &d_output, data_size);
// fill data
for ( i = 0 ; i < num_elements ; i ++ )
h_data[i] = i+1;
// transfer data from cpu to gpu CONSTANT MEMORY
cudaMemcpy(d_data, h_data, data_size, cudaMemcpyHostToDevice);
// kernel invocation
mem3_kernel <<< grid_size, block_size, data_size*2.3 >>> ( d_data, d_output, num_elements, test_iterations );
// trasnfer output from gpu to cpu
cudaMemcpy(h_output, d_output, data_size, cudaMemcpyDeviceToHost);
// calculate bandwidth
total_clk_cycles = h_output[0];
ave_clk_cycles = total_clk_cycles / test_iterations;
total_time = total_clk_cycles / gpu_clk_rate / 1e9; // (seconds)
ave_time = ave_clk_cycles / gpu_clk_rate / 1e9; // (seconds)
bandwidth = data_size / ave_time; // (byte/second)
// display results
printf("\nSmem test used %d test iterations\n", test_iterations);
printf("Total time of %f ms (%d clk cycles)\n", (float)total_time*1e3, total_clk_cycles);
printf("Average time of %f us (%d clk cycles)\n", (float)ave_time*1e6, ave_clk_cycles);
printf("Transfered data = %d bytes\n", data_size);
printf("---> Bandwidth = %f GB/s <---\n\n", bandwidth*1e-9);
// free memory
free(h_data);
free(h_output);
cudaFree(d_data);
cudaFree(d_output);
}
[/codebox]