Bandwidth test: test memory bandwidth.
Especially important for PCIE capability. Different MB has different PCIE capability.
The CUDA adaptor performance is depend on the capability of PCIE. It could be the performance bottleneck.
On the following programming drills, the number of clock cycles necessary for computation and utilised memory bandwidth have to be reported.
(1) parallelization in the programs - using 256 threads
(2) improving the memory access modes
(3) testing the parallelization by using 512/1024
(4) utilizing BLOCKS in the computation
(5) utilizing shared memory
(6) improving the computation performance by using a Treesum algorithm
(7) resolving the memory band conflict issue, encountered in applying Treesum algorithm with the shared memory
[b]
My CUDA sample
[/b]
#include <cuda_runtime.h>
#include <malloc.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define BLOCK_NUM 32
#define DATA_SIZE 1048576
#define THREAD_NUM 256
#define THREAD_NUM1 512
#define THREAD_NUM2 1024
int data[DATA_SIZE];
bool InitCUDA()
{
int count;
cudaGetDeviceCount(&count);
if (count == 0)
{
fprintf(stderr, "There is no device.\n");
return false;
}
printf("There are %d Devices\n", count);
int i;
for (i = 0; i < count; i++)
{
cudaDeviceProp prop;
if (cudaGetDeviceProperties(&prop, i) == cudaSuccess)
{
if (prop.major >= 1)
{
break;
}
}
}
if(i == count)
{
printf(stderr, "There is no device supporting CUDA 1.x.\n");
return false;
}
cudaSetDevice(i);
return true;
}
void GenerateNumbers(int *number, int size)
{
for (int i = 0; i < size; i++)
{
srand(time(0));
number[i] = rand() % 10;
}
}
int main()
{
int iDeviceCount = 0;
cudaGetDeviceCount(&iDeviceCount);
//printf("Number of GPU: %d\n", iDeviceCount);
cudaDeviceProp sDeviceProp;
cudaGetDeviceProperties(&sDeviceProp, 0);
initCUDA(sDeviceProp);
GenerateNumbers(data, DATA_SIZE);
cpu_SumofSquares(data, DATA_SIZE);
int *gpudata, *result;
clock_t* time;
THREAD_NUM=0
cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
cudaMalloc((void**)&result, sizeof(int));
cudaMalloc((void**)&time, sizeof(clock_t));
cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
sumOfSquares << <1, 1, 0 >> >(gpudata, result, time);
int sum1 = 0;
clock_t time_used;
cudaMemcpy(&sum1, result, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
THREAD_NUM=256
cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM);
cudaMalloc((void**)&time, sizeof(clock_t));
cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
int sum[THREAD_NUM];
cudaMemcpy(&sum, result, sizeof(int)* THREAD_NUM, cudaMemcpyDeviceToHost);
cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
int final_sum = 0;
for (int i = 0; i <THREAD_NUM; i++)
{
final_sum += sum[i];
}
printf("\n(1) Thread Numbers = 256");
printf("\n(GPU) Sum : %d", final_sum);
printf("\n(GPU) Time : %d ms, ", (time_used / sDeviceProp.clockRate));
s_sum[1] = (time_used / sDeviceProp.clockRate);
printf("\nTransfer rate : %.f MB/s", (double) / ((double)(time_used / sDeviceProp.clockRate) / (double)1000));
printf("\n(GPU) Time : %f ms", msecTotal1);
THREAD_NUM=256
cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM);
cudaMalloc((void**)&time, sizeof(clock_t));
cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
cudaMemcpy(&sum, result, sizeof(int)* THREAD_NUM, cudaMemcpyDeviceToHost);
cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
final_sum = 0;
for (int i = 0; i <THREAD_NUM; i++)
{
final_sum += sum[i];
}
printf("\n(2) Thread Numbers = 256 memory access");
printf("\n(GPU) Sum : %d\n", final_sum);
printf("\n(GPU) Time : %d ms, ", (time_used / sDeviceProp.clockRate));
s_sum[2] = (time_used / sDeviceProp.clockRate);
printf("\nTransfer rate : %.f MB/s", (double) / ((double)(time_used / sDeviceProp.clockRate) / (double)1000));
printf("\n(GPU) Time : %f ms\n", msecTotal1);
THREAD_NUM=512 memory access
cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM1);
cudaMalloc((void**)&time, sizeof(clock_t));
cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
int sum512[THREAD_NUM1];
//clock_t time_used;
cudaMemcpy(&sum512, result, sizeof(int)* THREAD_NUM1, cudaMemcpyDeviceToHost);
cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
final_sum = 0;
for (int i = 0; i <THREAD_NUM1; i++)
{
final_sum += sum512[i];
}
printf("\n(3-1) Thread Numbers = 512 memory access");
printf("\n(GPU) Sum : %d\n", final_sum);
printf("(\nGPU) Time : %d ms , ", (time_used / sDeviceProp.clockRate));
s_sum[3] = (time_used / sDeviceProp.clockRate);
printf("\nTransfer rate : %.2f MB/s \n", (double) / ((double)(time_used / sDeviceProp.clockRate) / (double)1000));
//printf("(GPU) Time : %f ms\n", msecTotal1);
THREAD_NUM=1024 memory access
cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM2);
cudaMalloc((void**)&time, sizeof(clock_t));
cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
int sum1024[THREAD_NUM2];
cudaMemcpy(&sum1024, result, sizeof(int)* THREAD_NUM2, cudaMemcpyDeviceToHost);
cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
final_sum = 0;
for (int i = 0; i <THREAD_NUM2; i++)
{
final_sum += sum1024[i];
}
printf("\n(3-2) Thread Numbers = 1024 memory access");
printf("\n(GPU) Sum : %d\n", final_sum);
printf("(\nGPU) Time : %f ms , ", msecTotal1);
s_sum[4] = msecTotal1;
printf("\nTransfer rate: %.f MB/s\n", (double) / ((double)(msecTotal1) / (double)1000));
//printf("(GPU) Time : %d ms , ", (time_used / sDeviceProp.clockRate));
//printf("Transfer rate : %.f MB/s\n", (double) / ((double)(time_used / sDeviceProp.clockRate) / (double)1000));
//printf("(GPU) Time : %f ms\n", msecTotal1);
THREAD_NUM=256 BLOCK_NUM 32 memory access
cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM * BLOCK_NUM);
cudaMalloc((void**)&time, sizeof(clock_t)* BLOCK_NUM * 2);
cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
int sum32_256[THREAD_NUM * BLOCK_NUM];
clock_t time_used32_256[BLOCK_NUM * 2];
cudaMemcpy(&sum32_256, result, sizeof(int)* THREAD_NUM * BLOCK_NUM, cudaMemcpyDeviceToHost);
cudaMemcpy(&time_used32_256, time, sizeof(clock_t)* BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
int final_sum32_256 = 0;
for (int i = 0; i < THREAD_NUM * BLOCK_NUM; i++)
{
final_sum32_256 += sum32_256[i];
}
QueryPerformanceCounter(&timeEnd);
double elapsed = ((timeEnd.QuadPart - timeStart.QuadPart) / quadpart) * 1000;
clock_t min_start, max_end;
min_start = time_used32_256[0];
max_end = time_used32_256[BLOCK_NUM];
for (int i = 0; i < BLOCK_NUM; i++)
{
if (min_start > time_used32_256[i])
min_start = time_used32_256[i];
if (max_end < time_used32_256[i + BLOCK_NUM])
max_end = time_used32_256[i + BLOCK_NUM];
}
printf("\n(4) Block Numbers = 32 Threads Numbers = 256 memory access");
printf("\n(GPU) Sum : %d", final_sum32_256);
printf("\n(GPU) Time : %f ms, ", msecTotal1);
printf("\nTransfer rate : %.2f MB/s", (double) / ((double)(msecTotal1) / (double)1000));
printf("\n(CPU) Time : %f ms", elapsed);
printf("\n(TOTAL) Time : %f ms", elapsed + msecTotal1);
s_sum[5] = (elapsed + msecTotal1);
printf("\n%d", sDeviceProp.clockRate);
printf("\n(GPU) Time : %f ms", ((double)(max_end - min_start) / (double)sDeviceProp.clockRate));
printf("\ntime: %d",( max_end- min_start ));
THREAD_NUM=256 BLOCK_NUM=32 shared memory memory access
cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM * BLOCK_NUM);
cudaMalloc((void**)&time, sizeof(clock_t)* BLOCK_NUM * 2);
cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
int sum_shared[BLOCK_NUM];
clock_t time_used_shared[BLOCK_NUM * 2];
cudaMemcpy(&sum_shared, result, sizeof(int)* BLOCK_NUM, cudaMemcpyDeviceToHost);
cudaMemcpy(&time_used_shared, time, sizeof(clock_t)* BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
int final_sum_shared = 0;
for (int i = 0; i < BLOCK_NUM; i++)
{
final_sum_shared += sum_shared[i];
}
printf("\n(5) Block Numbers = 32 Thread Numbers = 256 shared memory, memory access");
printf("\n(GPU) Sum : %d", final_sum_shared);
printf("\n(GPU) Time : %f ms, ", msecTotal1);
s_sum[6] = msecTotal1;
printf("\nTransfer rate: %.f MB/s", (double) / ((double)(msecTotal1) / (double)1000));
THREAD_NUM=256 BLOCK_NUM=32 shared memory and treesum memory access
cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM * BLOCK_NUM);
cudaMalloc((void**)&time, sizeof(clock_t)* BLOCK_NUM * 2);
cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
int sum_treesum[BLOCK_NUM];
clock_t time_used_treesum[BLOCK_NUM * 2];
cudaMemcpy(&sum_treesum, result, sizeof(int)* BLOCK_NUM, cudaMemcpyDeviceToHost);
cudaMemcpy(&time_used_treesum, time, sizeof(clock_t)* BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
int final_sum_treesum = 0;
for (int i = 0; i < BLOCK_NUM; i++)
{
final_sum_treesum += sum_treesum[i];
}
printf("\n(6-1) Block Numbers = 32 Thread Numbers = 256 shared memory and treesum, memory access");
printf("\n(GPU) Sum : %d", final_sum_treesum);
printf("\n(GPU) Time : %f ms, ", msecTotal1);
s_sum[7] = msecTotal1;
printf("\nTransfer rate: %.f MB/s\n", (double) / ((double)(msecTotal1) / (double)1000));
THREAD_NUM=256 BLOCK_NUM=32 shared memory and treesum2 memory access
cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM * BLOCK_NUM);
cudaMalloc((void**)&time, sizeof(clock_t)* BLOCK_NUM * 2);
cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
sumOfSquares_block32_thread256_shared_treesum2 << <BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int) >> >(gpudata, result, time);
int sum_treesum2[BLOCK_NUM];
clock_t time_used_treesum2[BLOCK_NUM * 2];
cudaMemcpy(&sum_treesum2, result, sizeof(int)* BLOCK_NUM, cudaMemcpyDeviceToHost);
cudaMemcpy(&time_used_treesum2, time, sizeof(clock_t)* BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
int final_sum_treesum2 = 0;
for (int i = 0; i < BLOCK_NUM; i++)
{
final_sum_treesum2 += sum_treesum2[i];
}
printf("\n(6-2) Block Numbers = 32 Thread Numbers = 256 shared memory and treesum2, memory access");
printf("\n(GPU) Sum : %d\n", final_sum_treesum2);
printf("\n(GPU) Time : %f ms , ", msecTotal1);
s_sum[8] = msecTotal1;
printf("\nTransfer rate: %.f MB/s\n", (double)((double)4 / ((double)(msecTotal1) / (double)1000)));
THREAD_NUM=256 BLOCK_NUM=32 shared memory and treesum3 memory access
cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM * BLOCK_NUM);
cudaMalloc((void**)&time, sizeof(clock_t)* BLOCK_NUM * 2);
cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
cudaEventRecord(start1, NULL);
sumOfSquares_block32_thread256_shared_treesum3 << <BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int) >> >(gpudata, result, time);
cudaEventRecord(stop1, NULL);
cudaEventSynchronize(stop1);
cudaEventElapsedTime(&msecTotal1, start1, stop1);
int sum_treesum3[BLOCK_NUM];
clock_t time_used_treesum3[BLOCK_NUM * 2];
cudaMemcpy(&sum_treesum3, result, sizeof(int)* BLOCK_NUM, cudaMemcpyDeviceToHost);
cudaMemcpy(&time_used_treesum3, time, sizeof(clock_t)* BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
int final_sum_treesum3 = 0;
for (int i = 0; i < BLOCK_NUM; i++)
{
final_sum_treesum3 += sum_treesum3[i];
}
printf("\n(7) Block Numbers = 32 Thread Numbers = 256 shared memory and treesum3, memory access");
printf("\n(GPU) sum : %d\n", final_sum_treesum3);
printf("(GPU) Time : %f ms , ", msecTotal1);
s_sum[9] = msecTotal1;
printf("Transfer rate: %.f MB/s\n", (double)((double)4 / ((double)(msecTotal1) / (double)1000)));
for (int i = 0; i < 10; i++)
{
printf("%0.2f ,", s_sum[0]/s_sum[i]);
}
printf("\n ");
system("\n pause");
return 0;
}
__global__ static void sumOfSquares_1(int *num, int* result, clock_t* time)
{
const int tid = threadIdx.x;
const int size = DATA_SIZE / THREAD_NUM;
int sum = 0;
int i;
clock_t start;
if (tid == 0) start = clock();
for (i = tid * size; i < (tid + 1) * size; i++)
{
sum += num[i] * num[i];
}
result[tid] = sum;
if (tid == 0) *time = clock() - start;
}
__global__ static void sumOfSquares_2(int *num, int* result, clock_t* time)
{
const int tid = threadIdx.x;
//const int size = DATA_SIZE / THREAD_NUM;
int sum = 0;
int i;
clock_t start;
if (tid == 0) start = clock();
for (i = tid; i < DATA_SIZE; i += THREAD_NUM)
{
sum += num[i] * num[i];
}
result[tid] = sum;
if (tid == 0) *time = clock() - start;
}
__global__ static void sumOfSquares_512(int *num, int* result, clock_t* time)
{
const int tid = threadIdx.x;
//const int size = DATA_SIZE / THREAD_NUM;
int sum = 0;
int i;
clock_t start;
if (tid == 0) start = clock();
for (i = tid; i < DATA_SIZE; i += THREAD_NUM1)
{
sum += num[i] * num[i];
}
result[tid] = sum;
if (tid == 0) *time = clock() - start;
}
__global__ static void sumOfSquares_1024(int *num, int* result, clock_t* time)
{
const int tid = threadIdx.x;
//const int size = DATA_SIZE / THREAD_NUM;
int sum = 0;
int i;
clock_t start;
if (tid == 0) start = clock();
for (i = tid; i < DATA_SIZE; i += THREAD_NUM2)
{
sum += num[i] * num[i];
}
result[tid] = sum;
if (tid == 0) *time = clock() - start;
}
__global__ static void sumOfSquares_block32_thread256(int *num, int* result, clock_t* time)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
int sum = 0;
int i;
if (tid == 0) time[bid] = clock();
if (time[bid] < 0) time[bid] = -time[bid];
for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
{
sum += num[i] * num[i];
}
result[bid * THREAD_NUM + tid] = sum;
if (tid == 0) time[bid + BLOCK_NUM] = clock();
if (time[bid + BLOCK_NUM] < 0) time[bid + BLOCK_NUM] = -time[bid + BLOCK_NUM];
}
__global__ static void sumOfSquares_block32_thread256_shared(int *num, int* result, clock_t* time)
{
extern __shared__ int shared[];
const int tid = threadIdx.x;
const int bid = blockIdx.x;
int i;
if (tid == 0) time[bid] = clock();
shared[tid] = 0;
for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
{
shared[tid] += num[i] * num[i];
}
__syncthreads();
if (tid == 0)
{
for (i = 1; i < THREAD_NUM; i++)
{
shared[0] += shared[i];
}
result[bid] = shared[0];
}
if (tid == 0) time[bid + BLOCK_NUM] = clock();
}
__global__ static void sumOfSquares_block32_thread256_shared_treesum(int *num, int* result, clock_t* time)
{
extern __shared__ int shared[];
const int tid = threadIdx.x;
const int bid = blockIdx.x;
int i;
int offset = 1, mask = 1;
if (tid == 0) time[bid] = clock();
shared[tid] = 0;
for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
shared[tid] += num[i] * num[i];
}
__syncthreads();
while (offset < THREAD_NUM)
{
if ((tid & mask) == 0)
{
shared[tid] += shared[tid + offset];
}
offset += offset;
mask = offset + mask;
__syncthreads();
}
if (tid == 0)
{
result[bid] = shared[0];
time[bid + BLOCK_NUM] = clock();
}
}
offset = THREAD_NUM / 2;
while (offset > 0)
{
if (tid< offset)
{
shared[tid] += shared[tid + offset];
}
offset >>= 1;
__syncthreads();
}
if (tid < 128)
{
shared[tid] += shared[tid + 128];
}
__syncthreads();
if (tid < 64)
{
shared[tid] += shared[tid + 64];
}
__syncthreads();
if (tid < 32)
{
shared[tid] += shared[tid + 32];
}
__syncthreads();
if (tid < 16)
{
shared[tid] += shared[tid + 16];
}
__syncthreads();
if (tid < 8)
{
shared[tid] += shared[tid + 8];
}
__syncthreads();
if (tid < 4)
{
shared[tid] += shared[tid + 4];
}
__syncthreads();
if (tid < 2)
{
shared[tid] += shared[tid + 2];
}
__syncthreads();
if (tid < 1)
{
shared[tid] += shared[tid + 1];
}
__syncthreads();
if (tid == 0)
{
result[bid] = shared[0];
time[bid + BLOCK_NUM] = clock();
}
Some problems over here but not sure how to modify it