How to modify the Cuda output

[b]

Bandwidth test: test memory bandwidth.
Especially important for PCIE capability. Different MB has different PCIE capability.
The CUDA adaptor performance is depend on the capability of PCIE. It could be the performance bottleneck.

On the following programming drills, the number of clock cycles necessary for computation and utilised memory bandwidth have to be reported.

(1) parallelization in the programs - using 256 threads

(2) improving the memory access modes

(3) testing the parallelization by using 512/1024

(4) utilizing BLOCKS in the computation

(5) utilizing shared memory

(6) improving the computation performance by using a Treesum algorithm

(7) resolving the memory bank conflict issue, encountered in applying Treesum algorithm with the shared memory
[/b]

My Cuda code below can run but not sure how to modify the output -

#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <windows.h>

void initCUDA(const cudaDeviceProp sDevProp);
void GenerateNumbers(int *number, int size);
void cpu_SumofSquares(int *number, int size);

__global__ static void sumOfSquares(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_1(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_2(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_512(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_1024(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_block32_thread256(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_block32_thread256_shared(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_block32_thread256_shared_treesum(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_block32_thread256_shared_treesum2(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_block32_thread256_shared_treesum3(int *num, int* result, clock_t* time);

#define DATA_SIZE 1048576
#define THREAD_NUM 256
#define THREAD_NUM1 512
#define THREAD_NUM2 1024
#define BLOCK_NUM 32
int data[DATA_SIZE];

int main()
{
	int iDeviceCount = 0;
	cudaGetDeviceCount(&iDeviceCount);
	cudaDeviceProp  sDeviceProp;
	cudaGetDeviceProperties(&sDeviceProp, 0);
	GenerateNumbers(data, DATA_SIZE);

	LARGE_INTEGER timeStart;
	LARGE_INTEGER timeEnd;
	LARGE_INTEGER frequency;
	double s_sum[9];

	int *gpudata, *result;
	clock_t* time;

	cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
	cudaMalloc((void**)&result, sizeof(int));
	cudaMalloc((void**)&time, sizeof(clock_t));
	cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

	cudaEvent_t start1;
	cudaEvent_t stop1;
	cudaEventCreate(&start1);
	cudaEventCreate(&stop1);
	cudaEventRecord(start1, NULL);

	sumOfSquares << <1, 1, 0 >> >(gpudata, result, time);

	cudaEventRecord(stop1, NULL);
	cudaEventSynchronize(stop1);
	float msecTotal1 = 0.0f;
	cudaEventElapsedTime(&msecTotal1, start1, stop1);

	int sum1 = 0;
	clock_t time_used;
	cudaMemcpy(&sum1, result, sizeof(int), cudaMemcpyDeviceToHost);
	cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);

	printf("THREAD_NUM=1");
	printf("\n(GPU) sum  : %d\n", sum1);
	printf("(GPU) Time : %d ms ,  ", (time_used / sDeviceProp.clockRate));
	s_sum[0] = (time_used / sDeviceProp.clockRate);
	printf("Transfer rate: %.2f MB/s \n", (double)4 / ((double)(time_used / sDeviceProp.clockRate) / (double)1000));

	cudaFree(gpudata);
	cudaFree(result);
	cudaFree(time);

	cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
	cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM);
	cudaMalloc((void**)&time, sizeof(clock_t));
	cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

	cudaEventRecord(start1, NULL);
	sumOfSquares_1 << <1, THREAD_NUM, 0 >> >(gpudata, result, time);
	cudaEventRecord(stop1, NULL);
	cudaEventSynchronize(stop1);
	cudaEventElapsedTime(&msecTotal1, start1, stop1);

	int sum[THREAD_NUM];
	cudaMemcpy(&sum, result, sizeof(int)* THREAD_NUM, cudaMemcpyDeviceToHost);
	cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);

	cudaFree(gpudata);
	cudaFree(result);
	cudaFree(time);

	int final_sum = 0;
	for (int i = 0; i <THREAD_NUM; i++)
	{
	final_sum += sum[i];
	}

	printf("\n(1) Thread Numbers = 256");
	printf("\n(GPU) Sum : %d", final_sum);
	printf("\n(GPU) Time : %d ms,  ", (time_used / sDeviceProp.clockRate));
	s_sum[1] = (time_used / sDeviceProp.clockRate);
	printf("\nTransfer rate : %.f MB/s", (double)4 / ((double)(time_used / sDeviceProp.clockRate) / (double)1000));

	cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
	cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM);
	cudaMalloc((void**)&time, sizeof(clock_t));
	cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

	cudaEventRecord(start1, NULL);
	sumOfSquares_2 << <1, THREAD_NUM, 0 >> >(gpudata, result, time);
	cudaEventRecord(stop1, NULL);
	cudaEventSynchronize(stop1);
	cudaEventElapsedTime(&msecTotal1, start1, stop1);

	cudaMemcpy(&sum, result, sizeof(int)* THREAD_NUM, cudaMemcpyDeviceToHost);
	cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);

	cudaFree(gpudata);
	cudaFree(result);
	cudaFree(time);

	final_sum = 0;
	for (int i = 0; i <THREAD_NUM; i++)
	{
	final_sum += sum[i];
	}

	printf("\n(2) Thread Numbers = 256 memory access");
	printf("\n(GPU) Sum : %d\n", final_sum);
	printf("\n(GPU) Time : %d ms,  ", (time_used / sDeviceProp.clockRate));
	s_sum[2] = (time_used / sDeviceProp.clockRate);
	printf("\nTransfer rate: %.f MB/s \n", ((double)4 / ((double)(time_used / sDeviceProp.clockRate) / (double)1000)));

	cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
	cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM1);
	cudaMalloc((void**)&time, sizeof(clock_t));
	cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

	cudaEventRecord(start1, NULL);
	sumOfSquares_512 << <1, THREAD_NUM1, 0 >> >(gpudata, result, time);
	cudaEventRecord(stop1, NULL);
	cudaEventSynchronize(stop1);
	cudaEventElapsedTime(&msecTotal1, start1, stop1);

	int sum512[THREAD_NUM1];
	cudaMemcpy(&sum512, result, sizeof(int)* THREAD_NUM1, cudaMemcpyDeviceToHost);
	cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);

	cudaFree(gpudata);
	cudaFree(result);
	cudaFree(time);

	final_sum = 0;
	for (int i = 0; i <THREAD_NUM1; i++)
	{
	final_sum += sum512[i];
	}

	printf("\n(3-1) Thread Numbers = 512 memory access");
	printf("\n(GPU) Sum : %d\n", final_sum);
	printf("(\nGPU) Time : %d ms ,  ", (time_used / sDeviceProp.clockRate));
	s_sum[3] = (time_used / sDeviceProp.clockRate);
	printf("\nTransfer rate : %.2f MB/s \n", (double)4 / ((double)(time_used / sDeviceProp.clockRate) / (double)1000));

	cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
	cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM2);
	cudaMalloc((void**)&time, sizeof(clock_t));
	cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

	cudaEventRecord(start1, NULL);
	sumOfSquares_1024 << <1, THREAD_NUM2, 0 >> >(gpudata, result, time);
	cudaEventRecord(stop1, NULL);
	cudaEventSynchronize(stop1);
	cudaEventElapsedTime(&msecTotal1, start1, stop1);

	cudaEventRecord(stop1, NULL);
	cudaEventSynchronize(stop1);
	cudaEventElapsedTime(&msecTotal1, start1, stop1);

	int sum1024[THREAD_NUM2];
	cudaMemcpy(&sum1024, result, sizeof(int)* THREAD_NUM2, cudaMemcpyDeviceToHost);
	cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);

	cudaFree(gpudata);
	cudaFree(result);
	cudaFree(time);

	final_sum = 0;
	for (int i = 0; i <THREAD_NUM2; i++)
	{
	final_sum += sum1024[i];
	}

	printf("\n(3-2) Thread Numbers = 1024 memory access");
	printf("\n(GPU) sum  : %d\n", final_sum);
	printf("(GPU) Time : %f ms, ", msecTotal1);
	s_sum[4] = msecTotal1;
	printf("Transfer rate: %.2f MB/s\n", (double)4 / ((double)(msecTotal1) / (double)1000));

	cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
	cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM * BLOCK_NUM);
	cudaMalloc((void**)&time, sizeof(clock_t)* BLOCK_NUM * 2);
	cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

	cudaEventRecord(start1, NULL);
	sumOfSquares_block32_thread256 << <BLOCK_NUM, THREAD_NUM, 0 >> >(gpudata, result, time);
	cudaEventRecord(stop1, NULL);
	cudaEventSynchronize(stop1);
	cudaEventElapsedTime(&msecTotal1, start1, stop1);
	QueryPerformanceFrequency(&frequency);
	double quadpart = (double)frequency.QuadPart;
	QueryPerformanceCounter(&timeStart);

	int sum32_256[THREAD_NUM * BLOCK_NUM];
	clock_t time_used32_256[BLOCK_NUM * 2];
	cudaMemcpy(&sum32_256, result, sizeof(int)* THREAD_NUM * BLOCK_NUM, cudaMemcpyDeviceToHost);
	cudaMemcpy(&time_used32_256, time, sizeof(clock_t)* BLOCK_NUM * 2, cudaMemcpyDeviceToHost);

	cudaFree(gpudata);
	cudaFree(result);
	cudaFree(time);

	int final_sum32_256 = 0;
	for (int i = 0; i < THREAD_NUM * BLOCK_NUM; i++)
	{
	final_sum32_256 += sum32_256[i];
	}
	QueryPerformanceCounter(&timeEnd);
	double elapsed = ((timeEnd.QuadPart - timeStart.QuadPart) / quadpart) * 1000;

	clock_t min_start, max_end;
	min_start = time_used32_256[0];
	max_end = time_used32_256[BLOCK_NUM];
	for (int i = 0; i < BLOCK_NUM; i++)
	{
		if (min_start > time_used32_256[i])
		min_start = time_used32_256[i];
		if (max_end < time_used32_256[i + BLOCK_NUM])
		max_end = time_used32_256[i + BLOCK_NUM];
	}
	printf("\n(4) Block Numbers = 32 Threads Numbers = 256 memory access");
	printf("\n(GPU) Sum : %d", final_sum32_256);
	printf("\n(GPU) Time : %f ms, ", msecTotal1);
	printf("\nTransfer rate : %.2f MB/s", (double)4 / ((double)(msecTotal1) / (double)1000));
	printf("\n(CPU) Time : %f ms", elapsed);
	printf("\n(TOTAL) Time : %f ms", elapsed + msecTotal1);
	s_sum[5] = (elapsed + msecTotal1);

	cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
	cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM * BLOCK_NUM);
	cudaMalloc((void**)&time, sizeof(clock_t)* BLOCK_NUM * 2);
	cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

	cudaEventRecord(start1, NULL);
	sumOfSquares_block32_thread256_shared << <BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int) >> >(gpudata, result, time);
	cudaEventRecord(stop1, NULL);
	cudaEventSynchronize(stop1);
	cudaEventElapsedTime(&msecTotal1, start1, stop1);

	int sum_shared[BLOCK_NUM];
	clock_t time_used_shared[BLOCK_NUM * 2];
	cudaMemcpy(&sum_shared, result, sizeof(int)* BLOCK_NUM, cudaMemcpyDeviceToHost);
	cudaMemcpy(&time_used_shared, time, sizeof(clock_t)* BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
	cudaFree(gpudata);
	cudaFree(result);
	cudaFree(time);

	int final_sum_shared = 0;

	for (int i = 0; i < BLOCK_NUM; i++)
	{
	final_sum_shared += sum_shared[i];
	}

	printf("\n(5) Block Numbers = 32 Thread Numbers = 256  shared memory, memory access");
	printf("\n(GPU) Sum : %d", final_sum_shared);
	printf("\n(GPU) Time : %f ms, ", msecTotal1);
	s_sum[6] = msecTotal1;
	printf("\nTransfer rate: %.f MB/s", (double)4 / ((double)(msecTotal1) / (double)1000));

	cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
	cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM * BLOCK_NUM);
	cudaMalloc((void**)&time, sizeof(clock_t)* BLOCK_NUM * 2);
	cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

	cudaEventRecord(start1, NULL);
	sumOfSquares_block32_thread256_shared_treesum << <BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int) >> >(gpudata, result, time);
	cudaEventRecord(stop1, NULL);
	cudaEventSynchronize(stop1);
	cudaEventElapsedTime(&msecTotal1, start1, stop1);

	int sum_treesum[BLOCK_NUM];
	clock_t time_used_treesum[BLOCK_NUM * 2];
	cudaMemcpy(&sum_treesum, result, sizeof(int)* BLOCK_NUM, cudaMemcpyDeviceToHost);
	cudaMemcpy(&time_used_treesum, time, sizeof(clock_t)* BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
	cudaFree(gpudata);
	cudaFree(result);
	cudaFree(time);

	int final_sum_treesum = 0;
	for (int i = 0; i < BLOCK_NUM; i++)
	{
	final_sum_treesum += sum_treesum[i];
	}

	printf("\n(6-1) Block Numbers = 32 Thread Numbers = 256  shared memory and treesum,  memory access");
	printf("\n(GPU) Sum : %d", final_sum_treesum);
	printf("\n(GPU) Time : %f ms, ", msecTotal1);
	s_sum[7] = msecTotal1;
	printf("\nTransfer rate: %.f MB/s\n", (double)4 / ((double)(msecTotal1) / (double)1000));

	cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
	cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM * BLOCK_NUM);
	cudaMalloc((void**)&time, sizeof(clock_t)* BLOCK_NUM * 2);
	cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

	cudaEventRecord(start1, NULL);
	sumOfSquares_block32_thread256_shared_treesum2 << <BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int) >> >(gpudata, result, time);
	cudaEventRecord(stop1, NULL);
	cudaEventSynchronize(stop1);
	cudaEventElapsedTime(&msecTotal1, start1, stop1);

	int sum_treesum2[BLOCK_NUM];
	clock_t time_used_treesum2[BLOCK_NUM * 2];
	cudaMemcpy(&sum_treesum2, result, sizeof(int)* BLOCK_NUM, cudaMemcpyDeviceToHost);
	cudaMemcpy(&time_used_treesum2, time, sizeof(clock_t)* BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
	cudaFree(gpudata);
	cudaFree(result);
	cudaFree(time);

	int final_sum_treesum2 = 0;
	for (int i = 0; i < BLOCK_NUM; i++)
	{
	final_sum_treesum2 += sum_treesum2[i];
	}

	printf("\n(6-2) Block Numbers = 32 Thread Numbers = 256 shared memory and treesum2,  memory access");
	printf("\n(GPU) Sum : %d\n", final_sum_treesum2);
	printf("\n(GPU) Time : %f ms  , ", msecTotal1);
	s_sum[8] = msecTotal1;
	printf("\nTransfer rate: %.f MB/s\n", (double)4 / ((double)(msecTotal1) / (double)1000));

	cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
	cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM * BLOCK_NUM);
	cudaMalloc((void**)&time, sizeof(clock_t)* BLOCK_NUM * 2);
	cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

	cudaEventRecord(start1, NULL);
	sumOfSquares_block32_thread256_shared_treesum3 << <BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int) >> >(gpudata, result, time);
	cudaEventRecord(stop1, NULL);
	cudaEventSynchronize(stop1);
	cudaEventElapsedTime(&msecTotal1, start1, stop1);

	int sum_treesum3[BLOCK_NUM];
	clock_t time_used_treesum3[BLOCK_NUM * 2];
	cudaMemcpy(&sum_treesum3, result, sizeof(int)* BLOCK_NUM, cudaMemcpyDeviceToHost);
	cudaMemcpy(&time_used_treesum3, time, sizeof(clock_t)* BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
	cudaFree(gpudata);
	cudaFree(result);
	cudaFree(time);

	int final_sum_treesum3 = 0;
	for (int i = 0; i < BLOCK_NUM; i++)
	{
	final_sum_treesum3 += sum_treesum3[i];
	}

	printf("\n(7) Block Numbers = 32 Thread Numbers = 256 shared memory and treesum3,  memory access");
	printf("\n(GPU) sum  : %d\n", final_sum_treesum3);
	printf("(GPU) Time : %f ms, ", msecTotal1);
	s_sum[9] = msecTotal1;
	printf("Transfer rate: %.f MB/s\n", (double)4 / ((double)(msecTotal1) / (double)1000));

	for (int i = 0; i < 10; i++)
	{
	printf("%0.2f ,", s_sum[0] / s_sum[i]);
	}
	printf("\n ");
	system("\n pause");
	return 0;
}

void GenerateNumbers(int *number, int size)
{
	for (int i = 0; i < size; i++)
	{
	srand(time(0));
	number[i] = rand() % 10;
	}
}

void cpu_SumofSquares(int *number, int size)
{

	int sum = 0;
	clock_t start_time, end_time;

	start_time = clock();
	for (int i = 0; i < size; i++)
	{
	sum += number[i] * number[i];
	}
	end_time = clock();
	printf("\n(CPU) sum : %d\n", sum);
	printf("start_clock: %f  end_clock: %f  \n", (double)start_time, (double)end_time);
	double total_time = (end_time - start_time);
	printf("(CPU) Time  : %f  ms\n", (double)total_time);

}

__global__ static void sumOfSquares(int *num, int* result, clock_t* time)
{
	const int tid = threadIdx.x;
	const int size = DATA_SIZE / THREAD_NUM;
	int sum = 0;
	int i;
	clock_t start = clock();
	for (i = 0; i < DATA_SIZE; i++)
	{
		sum += num[i] * num[i];
	}
	*result = sum;
	*time = clock() - start;
}

__global__ static void sumOfSquares_1(int *num, int* result, clock_t* time)
{
	const int tid = threadIdx.x;
                  const int size = DATA_SIZE / THREAD_NUM;
	int sum = 0;
	int i;
	clock_t start;
	if (tid == 0) start = clock();
	for (i = tid * size; i < (tid + 1)*size; i++)
	{
	sum += num[i] * num[i];
	}
	result[tid] = sum;
	if (tid == 0) *time = clock() - start;
}

__global__ static void sumOfSquares_2(int *num, int* result, clock_t* time)
{
	const int tid = threadIdx.x;
	int sum = 0;
	int i;
	clock_t start;
	if (tid == 0) start = clock();
	for (i = tid; i < DATA_SIZE; i += THREAD_NUM)
	{
	sum += num[i] * num[i];
	}
	result[tid] = sum;
	if (tid == 0) *time = clock() - start;
}

__global__ static void sumOfSquares_512(int *num, int* result, clock_t* time)
{
	const int tid = threadIdx.x;
	int sum = 0;
	int i;
	clock_t start;
	if (tid == 0) start = clock();
	for (i = tid; i < DATA_SIZE; i += THREAD_NUM1)
	{
	sum += num[i] * num[i];
	}
	result[tid] = sum;
	if (tid == 0) *time = clock() - start;
}

__global__ static void sumOfSquares_1024(int *num, int* result, clock_t* time)
{
	const int tid = threadIdx.x;
	int sum = 0;
	int i;
	clock_t start;
	if (tid == 0) start = clock();
	for (i = tid; i < DATA_SIZE; i += THREAD_NUM2)
	{
	sum += num[i] * num[i];
	}
	result[tid] = sum;
	if (tid == 0) *time = clock() - start;
}

__global__ static void sumOfSquares_block32_thread256(int *num, int* result, clock_t* time)
{
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	int sum = 0;
	int i;
	if (tid == 0) time[bid] = clock();
	if (time[bid] < 0) time[bid] = -time[bid];
	for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
	{
	sum += num[i] * num[i];
	}

	result[bid * THREAD_NUM + tid] = sum;
	if (tid == 0) time[bid + BLOCK_NUM] = clock();
	if (time[bid + BLOCK_NUM] < 0) time[bid + BLOCK_NUM] = -time[bid + BLOCK_NUM];
}

__global__ static void sumOfSquares_block32_thread256_shared(int *num, int* result, clock_t* time)
{
	extern __shared__ int shared[];
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	int i;
	if (tid == 0) time[bid] = clock();
	shared[tid] = 0;
	for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
	{
	shared[tid] += num[i] * num[i];
	}
	__syncthreads();
	if (tid == 0) 
                {
	for (i = 1; i < THREAD_NUM; i++)
	{
	shared[0] += shared[i];
	}
	result[bid] = shared[0];
	}
	if (tid == 0) time[bid + BLOCK_NUM] = clock();
}

__global__ static void sumOfSquares_block32_thread256_shared_treesum(int *num, int* result, clock_t* time)
{
	extern __shared__ int shared[];
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	int i;
	int offset = 1, mask = 1;
	if (tid == 0) time[bid] = clock();
	shared[tid] = 0;
	for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
	{
	shared[tid] += num[i] * num[i];
	}
	__syncthreads();

	while (offset < THREAD_NUM)
	{
	if ((tid & mask) == 0)
	{
	shared[tid] += shared[tid + offset];
	}
	offset += offset;
	mask = offset + mask;
	__syncthreads();
	}

	if (tid == 0)
	{
	result[bid] = shared[0];
	time[bid + BLOCK_NUM] = clock();
	}
}

__global__ static void sumOfSquares_block32_thread256_shared_treesum2(int *num, int* result, clock_t* time)
{
	extern __shared__ int shared[];
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	int i;
	int offset = 1, mask = 1;
	if (tid == 0) time[bid] = clock();
	shared[tid] = 0;
	for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
	{
	shared[tid] += num[i] * num[i];
	}
	__syncthreads();

	offset = THREAD_NUM / 2;
	while (offset > 0)
	{
	if (tid < offset)
	{		
                  shared[tid] += shared[tid + offset];
	}
	offset >>= 1;
	__syncthreads();
	}

	if (tid == 0)
	{
	result[bid] = shared[0];
	time[bid + BLOCK_NUM] = clock();
	}
}

__global__ static void sumOfSquares_block32_thread256_shared_treesum3(int *num, int* result, clock_t* time)
{
	extern __shared__ int shared[];
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	int i;
	int offset = 1, mask = 1;
	if (tid == 0) time[bid] = clock();
	shared[tid] = 0;
	for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
	{
	shared[tid] += num[i] * num[i];
	}
	__syncthreads();

	if (tid < 128) { shared[tid] += shared[tid + 128]; }
	__syncthreads();
	if (tid < 64) { shared[tid] += shared[tid + 64]; }
	__syncthreads();
	if (tid < 32) { shared[tid] += shared[tid + 32]; }
	__syncthreads();
	if (tid < 16) { shared[tid] += shared[tid + 16]; }
	__syncthreads();
	if (tid < 8) { shared[tid] += shared[tid + 8]; }
	__syncthreads();
	if (tid < 4) { shared[tid] += shared[tid + 4]; }
	__syncthreads();
	if (tid < 2) { shared[tid] += shared[tid + 2]; }
	__syncthreads();
	if (tid < 1) { shared[tid] += shared[tid + 1]; }
	__syncthreads();

	if (tid == 0) 
                 {
	result[bid] = shared[0];
	time[bid + BLOCK_NUM] = clock();
	}
}

[b]

Kindly please provide your suggestion thus I will able to improve computing skills

[/b]