How to modify the Cuda output

tronorinoyeong · May 21, 2018, 1:11am

[b]

Bandwidth test: measure memory bandwidth.
This is especially important for PCIe capability, because different motherboards have different PCIe capabilities.
The performance of the CUDA adapter depends on the capability of the PCIe link, which can become the performance bottleneck.

For each of the following programming drills, the number of clock cycles needed for the computation and the memory bandwidth utilised have to be reported.

(1) parallelization in the programs - using 256 threads

(2) improving the memory access modes

(3) testing the parallelization by using 512/1024

(4) utilizing BLOCKS in the computation

(5) utilizing shared memory

(6) improving the computation performance by using a Treesum algorithm

(7) resolving the memory bank conflict issue, encountered in applying Treesum algorithm with the shared memory
[/b]

My CUDA code below runs, but I am not sure how to modify its output:

#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <windows.h>

// Host-side helpers.
// NOTE(review): initCUDA is declared here but no definition or call is visible in this file — confirm it is still needed.
void initCUDA(const cudaDeviceProp sDevProp);
// Fills `number[0..size)` with pseudo-random values.
void GenerateNumbers(int *number, int size);
// CPU reference: sums the squares of `number[0..size)` and prints the result and timing.
void cpu_SumofSquares(int *number, int size);

// Sum-of-squares kernels. All share one signature: `num` is the input array,
// `result` receives the partial sum(s), `time` receives device clock() readings.
__global__ static void sumOfSquares(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_1(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_2(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_512(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_1024(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_block32_thread256(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_block32_thread256_shared(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_block32_thread256_shared_treesum(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_block32_thread256_shared_treesum2(int *num, int* result, clock_t* time);
__global__ static void sumOfSquares_block32_thread256_shared_treesum3(int *num, int* result, clock_t* time);

// Number of int elements in the input array (1048576 ints).
#define DATA_SIZE 1048576
// Threads per block used by the 256-thread kernels.
#define THREAD_NUM 256
// Threads per block used by sumOfSquares_512.
#define THREAD_NUM1 512
// Threads per block used by sumOfSquares_1024.
#define THREAD_NUM2 1024
// Number of blocks used by the multi-block kernels.
#define BLOCK_NUM 32
// Host copy of the input data; filled by GenerateNumbers and uploaded to the device in main.
int data[DATA_SIZE];

/* Signature shared by every sum-of-squares kernel in this file. */
typedef void (*SumKernel)(int *, int *, clock_t *);

/* Aborts with a diagnostic if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess)
	{
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/* Converts a device clock() cycle count to milliseconds (clockRate is in kHz). */
static double cyclesToMs(clock_t cycles, int clockRateKHz)
{
	return (double)cycles / (double)clockRateKHz;
}

/* Adds up `count` partial sums on the host. */
static int sumPartials(const int *values, int count)
{
	int total = 0;
	for (int i = 0; i < count; i++)
	{
		total += values[i];
	}
	return total;
}

/* Blocking device-to-host copy with error checking. */
static void fetchFromDevice(void *dst, const void *src, size_t bytes)
{
	checkCuda(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
}

/* Launches `kernel` between two event records and returns the elapsed GPU time in ms. */
static float launchTimed(SumKernel kernel, int blocks, int threads, size_t sharedBytes,
	int *gpudata, int *result, clock_t *timing, cudaEvent_t start, cudaEvent_t stop)
{
	float msec = 0.0f;
	checkCuda(cudaEventRecord(start, NULL), "cudaEventRecord (start)");
	kernel<<<blocks, threads, sharedBytes>>>(gpudata, result, timing);
	/* Launch-configuration errors are only visible through cudaGetLastError(). */
	checkCuda(cudaGetLastError(), "kernel launch");
	checkCuda(cudaEventRecord(stop, NULL), "cudaEventRecord (stop)");
	/* Execution errors inside the kernel surface at this synchronisation point. */
	checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
	checkCuda(cudaEventElapsedTime(&msec, start, stop), "cudaEventElapsedTime");
	return msec;
}

/*
 * Runs every sum-of-squares kernel variant on the same random input and prints,
 * for each one, the sum, the elapsed time and the derived transfer rate.
 * The last output line is the speed-up of each variant relative to the
 * single-thread run (s_sum[0] / s_sum[i]).
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main()
{
	int iDeviceCount = 0;
	checkCuda(cudaGetDeviceCount(&iDeviceCount), "cudaGetDeviceCount");
	if (iDeviceCount == 0)
	{
		fprintf(stderr, "No CUDA-capable device found\n");
		return EXIT_FAILURE;
	}
	cudaDeviceProp  sDeviceProp;
	checkCuda(cudaGetDeviceProperties(&sDeviceProp, 0), "cudaGetDeviceProperties");
	GenerateNumbers(data, DATA_SIZE);

	LARGE_INTEGER timeStart;
	LARGE_INTEGER timeEnd;
	LARGE_INTEGER frequency;
	/* Ten variants are timed (indices 0..9), so ten slots are required. */
	double s_sum[10];
	/* Size of the input in MB; evaluates to 4 for DATA_SIZE = 1048576 ints. */
	const double dataMB = (double)(sizeof(int) * DATA_SIZE) / (1024.0 * 1024.0);

	/*
	 * Device buffers are allocated once, sized for the largest variant
	 * (THREAD_NUM * BLOCK_NUM partial sums, 2 * BLOCK_NUM timestamps), and
	 * reused by every kernel. No kernel writes to the input array.
	 */
	int *gpudata, *result;
	clock_t* time;
	checkCuda(cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE), "cudaMalloc (input)");
	checkCuda(cudaMalloc((void**)&result, sizeof(int)* THREAD_NUM * BLOCK_NUM), "cudaMalloc (result)");
	checkCuda(cudaMalloc((void**)&time, sizeof(clock_t)* BLOCK_NUM * 2), "cudaMalloc (time)");
	checkCuda(cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");

	cudaEvent_t start1;
	cudaEvent_t stop1;
	checkCuda(cudaEventCreate(&start1), "cudaEventCreate (start)");
	checkCuda(cudaEventCreate(&stop1), "cudaEventCreate (stop)");

	/* Host-side staging buffers, large enough for every variant. */
	int partial[THREAD_NUM * BLOCK_NUM];
	clock_t blockClock[BLOCK_NUM * 2];
	float msecTotal1 = 0.0f;
	clock_t time_used;
	double ms;
	int final_sum;

	/* ---- Baseline: one block, one thread ---- */
	msecTotal1 = launchTimed(sumOfSquares, 1, 1, 0, gpudata, result, time, start1, stop1);
	int sum1 = 0;
	fetchFromDevice(&sum1, result, sizeof(int));
	fetchFromDevice(&time_used, time, sizeof(clock_t));
	/* Floating-point conversion avoids truncation to 0 ms and the resulting division by zero. */
	ms = cyclesToMs(time_used, sDeviceProp.clockRate);

	printf("THREAD_NUM=1");
	printf("\n(GPU) sum  : %d\n", sum1);
	printf("(GPU) Time : %d ms ,  ", (int)ms);
	s_sum[0] = ms;
	printf("Transfer rate: %.2f MB/s \n", dataMB / (ms / 1000.0));

	/* ---- (1) 256 threads, contiguous chunk per thread ---- */
	msecTotal1 = launchTimed(sumOfSquares_1, 1, THREAD_NUM, 0, gpudata, result, time, start1, stop1);
	fetchFromDevice(partial, result, sizeof(int)* THREAD_NUM);
	fetchFromDevice(&time_used, time, sizeof(clock_t));
	final_sum = sumPartials(partial, THREAD_NUM);
	ms = cyclesToMs(time_used, sDeviceProp.clockRate);

	printf("\n(1) Thread Numbers = 256");
	printf("\n(GPU) Sum : %d", final_sum);
	printf("\n(GPU) Time : %d ms,  ", (int)ms);
	s_sum[1] = ms;
	printf("\nTransfer rate : %.f MB/s", dataMB / (ms / 1000.0));

	/* ---- (2) 256 threads, interleaved access ---- */
	msecTotal1 = launchTimed(sumOfSquares_2, 1, THREAD_NUM, 0, gpudata, result, time, start1, stop1);
	fetchFromDevice(partial, result, sizeof(int)* THREAD_NUM);
	fetchFromDevice(&time_used, time, sizeof(clock_t));
	final_sum = sumPartials(partial, THREAD_NUM);
	ms = cyclesToMs(time_used, sDeviceProp.clockRate);

	printf("\n(2) Thread Numbers = 256 memory access");
	printf("\n(GPU) Sum : %d\n", final_sum);
	printf("\n(GPU) Time : %d ms,  ", (int)ms);
	s_sum[2] = ms;
	printf("\nTransfer rate: %.f MB/s \n", dataMB / (ms / 1000.0));

	/* ---- (3-1) 512 threads ---- */
	msecTotal1 = launchTimed(sumOfSquares_512, 1, THREAD_NUM1, 0, gpudata, result, time, start1, stop1);
	fetchFromDevice(partial, result, sizeof(int)* THREAD_NUM1);
	fetchFromDevice(&time_used, time, sizeof(clock_t));
	final_sum = sumPartials(partial, THREAD_NUM1);
	ms = cyclesToMs(time_used, sDeviceProp.clockRate);

	printf("\n(3-1) Thread Numbers = 512 memory access");
	printf("\n(GPU) Sum : %d\n", final_sum);
	printf("\n(GPU) Time : %d ms ,  ", (int)ms);
	s_sum[3] = ms;
	printf("\nTransfer rate : %.2f MB/s \n", dataMB / (ms / 1000.0));

	/* ---- (3-2) 1024 threads; reported with event time ---- */
	msecTotal1 = launchTimed(sumOfSquares_1024, 1, THREAD_NUM2, 0, gpudata, result, time, start1, stop1);
	fetchFromDevice(partial, result, sizeof(int)* THREAD_NUM2);
	final_sum = sumPartials(partial, THREAD_NUM2);

	printf("\n(3-2) Thread Numbers = 1024 memory access");
	printf("\n(GPU) sum  : %d\n", final_sum);
	printf("(GPU) Time : %f ms, ", msecTotal1);
	s_sum[4] = msecTotal1;
	printf("Transfer rate: %.2f MB/s\n", dataMB / ((double)msecTotal1 / 1000.0));

	/* ---- (4) 32 blocks x 256 threads ---- */
	msecTotal1 = launchTimed(sumOfSquares_block32_thread256, BLOCK_NUM, THREAD_NUM, 0, gpudata, result, time, start1, stop1);

	/* Host-side cost: copying the partial sums back and reducing them on the CPU. */
	QueryPerformanceFrequency(&frequency);
	double quadpart = (double)frequency.QuadPart;
	QueryPerformanceCounter(&timeStart);
	fetchFromDevice(partial, result, sizeof(int)* THREAD_NUM * BLOCK_NUM);
	fetchFromDevice(blockClock, time, sizeof(clock_t)* BLOCK_NUM * 2);
	int final_sum32_256 = sumPartials(partial, THREAD_NUM * BLOCK_NUM);
	QueryPerformanceCounter(&timeEnd);
	double elapsed = ((timeEnd.QuadPart - timeStart.QuadPart) / quadpart) * 1000;

	/*
	 * Slots [0, BLOCK_NUM) hold per-block start stamps and
	 * [BLOCK_NUM, 2*BLOCK_NUM) hold per-block end stamps. clock() is a
	 * per-multiprocessor counter, so the span is only an approximation.
	 */
	clock_t min_start = blockClock[0];
	clock_t max_end = blockClock[BLOCK_NUM];
	for (int i = 1; i < BLOCK_NUM; i++)
	{
		if (min_start > blockClock[i])
			min_start = blockClock[i];
		if (max_end < blockClock[i + BLOCK_NUM])
			max_end = blockClock[i + BLOCK_NUM];
	}

	printf("\n(4) Block Numbers = 32 Threads Numbers = 256 memory access");
	printf("\n(GPU) Sum : %d", final_sum32_256);
	printf("\n(GPU) Clock cycles : %ld", (long)(max_end - min_start));
	printf("\n(GPU) Time : %f ms, ", msecTotal1);
	printf("\nTransfer rate : %.2f MB/s", dataMB / ((double)msecTotal1 / 1000.0));
	printf("\n(CPU) Time : %f ms", elapsed);
	printf("\n(TOTAL) Time : %f ms", elapsed + msecTotal1);
	s_sum[5] = (elapsed + msecTotal1);

	/* ---- (5) shared memory, serial reduction inside each block ---- */
	msecTotal1 = launchTimed(sumOfSquares_block32_thread256_shared, BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int), gpudata, result, time, start1, stop1);
	fetchFromDevice(partial, result, sizeof(int)* BLOCK_NUM);
	int final_sum_shared = sumPartials(partial, BLOCK_NUM);

	printf("\n(5) Block Numbers = 32 Thread Numbers = 256  shared memory, memory access");
	printf("\n(GPU) Sum : %d", final_sum_shared);
	printf("\n(GPU) Time : %f ms, ", msecTotal1);
	s_sum[6] = msecTotal1;
	printf("\nTransfer rate: %.f MB/s", dataMB / ((double)msecTotal1 / 1000.0));

	/* ---- (6-1) shared memory, tree sum ---- */
	msecTotal1 = launchTimed(sumOfSquares_block32_thread256_shared_treesum, BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int), gpudata, result, time, start1, stop1);
	fetchFromDevice(partial, result, sizeof(int)* BLOCK_NUM);
	int final_sum_treesum = sumPartials(partial, BLOCK_NUM);

	printf("\n(6-1) Block Numbers = 32 Thread Numbers = 256  shared memory and treesum,  memory access");
	printf("\n(GPU) Sum : %d", final_sum_treesum);
	printf("\n(GPU) Time : %f ms, ", msecTotal1);
	s_sum[7] = msecTotal1;
	printf("\nTransfer rate: %.f MB/s\n", dataMB / ((double)msecTotal1 / 1000.0));

	/* ---- (6-2) shared memory, tree sum variant 2 ---- */
	msecTotal1 = launchTimed(sumOfSquares_block32_thread256_shared_treesum2, BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int), gpudata, result, time, start1, stop1);
	fetchFromDevice(partial, result, sizeof(int)* BLOCK_NUM);
	int final_sum_treesum2 = sumPartials(partial, BLOCK_NUM);

	printf("\n(6-2) Block Numbers = 32 Thread Numbers = 256 shared memory and treesum2,  memory access");
	printf("\n(GPU) Sum : %d\n", final_sum_treesum2);
	printf("\n(GPU) Time : %f ms  , ", msecTotal1);
	s_sum[8] = msecTotal1;
	printf("\nTransfer rate: %.f MB/s\n", dataMB / ((double)msecTotal1 / 1000.0));

	/* ---- (7) shared memory, tree sum variant 3 ---- */
	msecTotal1 = launchTimed(sumOfSquares_block32_thread256_shared_treesum3, BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int), gpudata, result, time, start1, stop1);
	fetchFromDevice(partial, result, sizeof(int)* BLOCK_NUM);
	int final_sum_treesum3 = sumPartials(partial, BLOCK_NUM);

	printf("\n(7) Block Numbers = 32 Thread Numbers = 256 shared memory and treesum3,  memory access");
	printf("\n(GPU) sum  : %d\n", final_sum_treesum3);
	printf("(GPU) Time : %f ms, ", msecTotal1);
	s_sum[9] = msecTotal1;
	printf("Transfer rate: %.f MB/s\n", dataMB / ((double)msecTotal1 / 1000.0));

	/* Speed-up of every variant relative to the single-thread baseline. */
	for (int i = 0; i < 10; i++)
	{
		printf("%0.2f ,", s_sum[0] / s_sum[i]);
	}
	printf("\n ");

	checkCuda(cudaEventDestroy(start1), "cudaEventDestroy (start)");
	checkCuda(cudaEventDestroy(stop1), "cudaEventDestroy (stop)");
	checkCuda(cudaFree(gpudata), "cudaFree (input)");
	checkCuda(cudaFree(result), "cudaFree (result)");
	checkCuda(cudaFree(time), "cudaFree (time)");

	system("\n pause");
	return 0;
}

/*
 * Fills number[0..size) with pseudo-random integers in the range 0..9.
 *
 * number: destination array with room for at least `size` ints.
 * size:   number of elements to generate; nothing is written when size <= 0.
 */
void GenerateNumbers(int *number, int size)
{
	/*
	 * Seed once per call. Re-seeding inside the loop with the same
	 * second-resolution timestamp restarts the generator every iteration,
	 * which makes every element identical.
	 */
	srand((unsigned int)time(0));
	for (int i = 0; i < size; i++)
	{
		number[i] = rand() % 10;
	}
}

/*
 * CPU reference implementation: computes the sum of squares of
 * number[0..size) and prints the sum, the raw clock() readings and the
 * elapsed time in milliseconds. The input array is not modified.
 */
void cpu_SumofSquares(int *number, int size)
{
	int sum = 0;
	clock_t start_time, end_time;

	start_time = clock();
	for (int i = 0; i < size; i++)
	{
		sum += number[i] * number[i];
	}
	end_time = clock();

	printf("\n(CPU) sum : %d\n", sum);
	printf("start_clock: %f  end_clock: %f  \n", (double)start_time, (double)end_time);
	/* clock() counts ticks, not milliseconds; convert through CLOCKS_PER_SEC. */
	double total_time = (double)(end_time - start_time) * 1000.0 / (double)CLOCKS_PER_SEC;
	printf("(CPU) Time  : %f  ms\n", total_time);
}

// Baseline kernel: the calling thread walks all DATA_SIZE elements by itself.
// main launches it as <<<1, 1>>>.
//   num    - input array of DATA_SIZE ints
//   result - receives the complete sum of squares
//   time   - receives the number of clock() cycles spent in the loop
__global__ static void sumOfSquares(int *num, int* result, clock_t* time)
{
	int total = 0;
	const clock_t began = clock();
	int idx = 0;
	while (idx < DATA_SIZE)
	{
		total += num[idx] * num[idx];
		++idx;
	}
	*result = total;
	*time = clock() - began;
}

// Each thread sums the squares of its own contiguous chunk of
// DATA_SIZE / THREAD_NUM elements. main launches it as <<<1, THREAD_NUM>>>.
//   num    - input array of DATA_SIZE ints
//   result - result[tid] receives the partial sum of thread tid
//   time   - receives the clock() cycles measured by thread 0 only
__global__ static void sumOfSquares_1(int *num, int* result, clock_t* time)
{
	const int tid = threadIdx.x;
	const int chunk = DATA_SIZE / THREAD_NUM;
	const int first = tid * chunk;
	const int last = first + chunk;
	const bool leader = (tid == 0);

	int total = 0;
	clock_t began;
	if (leader)
	{
		began = clock();
	}
	for (int idx = first; idx < last; ++idx)
	{
		total += num[idx] * num[idx];
	}
	result[tid] = total;
	if (leader)
	{
		*time = clock() - began;
	}
}

// Thread tid handles elements tid, tid + THREAD_NUM, tid + 2*THREAD_NUM, ...
// so neighbouring threads read neighbouring addresses.
// main launches it as <<<1, THREAD_NUM>>>.
//   num    - input array of DATA_SIZE ints
//   result - result[tid] receives the partial sum of thread tid
//   time   - receives the clock() cycles measured by thread 0 only
__global__ static void sumOfSquares_2(int *num, int* result, clock_t* time)
{
	const int tid = threadIdx.x;
	const bool leader = (tid == 0);

	int total = 0;
	clock_t began;
	if (leader)
	{
		began = clock();
	}
	int idx = tid;
	while (idx < DATA_SIZE)
	{
		total += num[idx] * num[idx];
		idx += THREAD_NUM;
	}
	result[tid] = total;
	if (leader)
	{
		*time = clock() - began;
	}
}

// Same interleaved scheme as the 256-thread version, but the stride between
// the elements of one thread is THREAD_NUM1 (512).
// main launches it as <<<1, THREAD_NUM1>>>.
//   num    - input array of DATA_SIZE ints
//   result - result[tid] receives the partial sum of thread tid
//   time   - receives the clock() cycles measured by thread 0 only
__global__ static void sumOfSquares_512(int *num, int* result, clock_t* time)
{
	const int tid = threadIdx.x;
	const bool leader = (tid == 0);

	int total = 0;
	clock_t began;
	if (leader)
	{
		began = clock();
	}
	int idx = tid;
	while (idx < DATA_SIZE)
	{
		total += num[idx] * num[idx];
		idx += THREAD_NUM1;
	}
	result[tid] = total;
	if (leader)
	{
		*time = clock() - began;
	}
}

// Same interleaved scheme as the 256-thread version, but the stride between
// the elements of one thread is THREAD_NUM2 (1024).
// main launches it as <<<1, THREAD_NUM2>>>.
//   num    - input array of DATA_SIZE ints
//   result - result[tid] receives the partial sum of thread tid
//   time   - receives the clock() cycles measured by thread 0 only
__global__ static void sumOfSquares_1024(int *num, int* result, clock_t* time)
{
	const int tid = threadIdx.x;
	const bool leader = (tid == 0);

	int total = 0;
	clock_t began;
	if (leader)
	{
		began = clock();
	}
	int idx = tid;
	while (idx < DATA_SIZE)
	{
		total += num[idx] * num[idx];
		idx += THREAD_NUM2;
	}
	result[tid] = total;
	if (leader)
	{
		*time = clock() - began;
	}
}

// Multi-block version: thread (bid, tid) starts at element bid * THREAD_NUM + tid
// and advances by BLOCK_NUM * THREAD_NUM elements per iteration.
// main launches it as <<<BLOCK_NUM, THREAD_NUM>>>.
//   num    - input array of DATA_SIZE ints
//   result - result[bid * THREAD_NUM + tid] receives each thread's partial sum
//   time   - time[bid] receives the block's start stamp and
//            time[bid + BLOCK_NUM] its end stamp, both written by thread 0
__global__ static void sumOfSquares_block32_thread256(int *num, int* result, clock_t* time)
{
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	int sum = 0;
	int i;

	// Only thread 0 touches the timestamp slots. The raw counter value is
	// stored unchanged: flipping the sign of a stamp (as every thread used to
	// do, racing with this write) makes end - start meaningless.
	if (tid == 0) time[bid] = clock();

	for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
	{
		sum += num[i] * num[i];
	}

	result[bid * THREAD_NUM + tid] = sum;
	if (tid == 0) time[bid + BLOCK_NUM] = clock();
}

// Multi-block version that keeps each thread's partial sum in shared memory;
// thread 0 then adds the THREAD_NUM partials serially.
// main launches it as <<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>.
//   num    - input array of DATA_SIZE ints
//   result - result[bid] receives the sum of block bid
//   time   - time[bid] / time[bid + BLOCK_NUM] receive the start / end
//            clock() stamps of block bid, written by thread 0
__global__ static void sumOfSquares_block32_thread256_shared(int *num, int* result, clock_t* time)
{
	extern __shared__ int shared[];
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	const int stride = BLOCK_NUM * THREAD_NUM;
	const bool leader = (tid == 0);

	if (leader)
	{
		time[bid] = clock();
	}

	shared[tid] = 0;
	int idx = bid * THREAD_NUM + tid;
	while (idx < DATA_SIZE)
	{
		shared[tid] += num[idx] * num[idx];
		idx += stride;
	}

	// Every partial must be in shared memory before thread 0 reads them.
	__syncthreads();

	if (leader)
	{
		for (int k = 1; k < THREAD_NUM; ++k)
		{
			shared[0] += shared[k];
		}
		result[bid] = shared[0];
		time[bid + BLOCK_NUM] = clock();
	}
}

// Shared-memory version with a pairwise tree reduction. In every pass a
// thread whose index is a multiple of 2 * step adds the element `step`
// positions to its right; step doubles each pass until it reaches THREAD_NUM.
// main launches it as <<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>.
//   num    - input array of DATA_SIZE ints
//   result - result[bid] receives the sum of block bid
//   time   - time[bid] / time[bid + BLOCK_NUM] receive the start / end
//            clock() stamps of block bid, written by thread 0
__global__ static void sumOfSquares_block32_thread256_shared_treesum(int *num, int* result, clock_t* time)
{
	extern __shared__ int shared[];
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	const int stride = BLOCK_NUM * THREAD_NUM;

	if (tid == 0)
	{
		time[bid] = clock();
	}

	shared[tid] = 0;
	int idx = bid * THREAD_NUM + tid;
	while (idx < DATA_SIZE)
	{
		shared[tid] += num[idx] * num[idx];
		idx += stride;
	}
	__syncthreads();

	for (int step = 1; step < THREAD_NUM; step += step)
	{
		// (2 * step - 1) selects the low bits that must be zero for an active thread.
		const int lowBits = step + step - 1;
		if ((tid & lowBits) == 0)
		{
			shared[tid] += shared[tid + step];
		}
		// The barrier is outside the branch so that every thread reaches it.
		__syncthreads();
	}

	if (tid == 0)
	{
		result[bid] = shared[0];
		time[bid + BLOCK_NUM] = clock();
	}
}

// Shared-memory version with a halving tree reduction: in every pass the
// first `half` threads add the element `half` positions to their right,
// starting at THREAD_NUM / 2 and halving until a single value remains.
// main launches it as <<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>.
//   num    - input array of DATA_SIZE ints
//   result - result[bid] receives the sum of block bid
//   time   - time[bid] / time[bid + BLOCK_NUM] receive the start / end
//            clock() stamps of block bid, written by thread 0
__global__ static void sumOfSquares_block32_thread256_shared_treesum2(int *num, int* result, clock_t* time)
{
	extern __shared__ int shared[];
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	const int stride = BLOCK_NUM * THREAD_NUM;

	if (tid == 0)
	{
		time[bid] = clock();
	}

	shared[tid] = 0;
	int idx = bid * THREAD_NUM + tid;
	while (idx < DATA_SIZE)
	{
		shared[tid] += num[idx] * num[idx];
		idx += stride;
	}
	__syncthreads();

	for (int half = THREAD_NUM / 2; half > 0; half >>= 1)
	{
		if (tid < half)
		{
			shared[tid] += shared[tid + half];
		}
		// The barrier is outside the branch so that every thread reaches it.
		__syncthreads();
	}

	if (tid == 0)
	{
		result[bid] = shared[0];
		time[bid + BLOCK_NUM] = clock();
	}
}

// Shared-memory version whose halving reduction uses the fixed step sequence
// 128, 64, 32, 16, 8, 4, 2, 1, with a block barrier after every step.
// The starting step of 128 assumes 256 partial sums per block.
// main launches it as <<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>.
//   num    - input array of DATA_SIZE ints
//   result - result[bid] receives the sum of block bid
//   time   - time[bid] / time[bid + BLOCK_NUM] receive the start / end
//            clock() stamps of block bid, written by thread 0
__global__ static void sumOfSquares_block32_thread256_shared_treesum3(int *num, int* result, clock_t* time)
{
	extern __shared__ int shared[];
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	const int stride = BLOCK_NUM * THREAD_NUM;

	if (tid == 0)
	{
		time[bid] = clock();
	}

	shared[tid] = 0;
	int idx = bid * THREAD_NUM + tid;
	while (idx < DATA_SIZE)
	{
		shared[tid] += num[idx] * num[idx];
		idx += stride;
	}
	__syncthreads();

	// Constant trip count (8 steps); the pragma asks the compiler to unroll it.
#pragma unroll
	for (int half = 128; half >= 1; half >>= 1)
	{
		if (tid < half)
		{
			shared[tid] += shared[tid + half];
		}
		__syncthreads();
	}

	if (tid == 0)
	{
		result[bid] = shared[0];
		time[bid + BLOCK_NUM] = clock();
	}
}

[b]

Kindly provide your suggestions so that I will be able to improve my computing skills.

[/b]

Topic		Replies	Views
Why cannot run this program CUDA Programming and Performance	7	951	May 11, 2018
How to perform CUDA programming to calculate and process the correct planning CUDA Programming and Performance	3	522	April 28, 2018
Cuda program taking more time. CUDA Programming and Performance	15	7056	November 21, 2010
How to use CUDA programming to calculate and process the correct number CUDA Programming and Performance	1	698	April 20, 2018
newbie in Cuda needs help with 2D arrays CUDA Programming and Performance	9	921	March 9, 2018
Memory problem? ...incredible slowdown CUDA Programming and Performance	29	16288	January 30, 2011
Understanding and adjusting Mark Harris's array reduction CUDA Programming and Performance	11	4081	August 26, 2018
2D reduction using CUDA The use a cuda and cublas library for a 2D simple reduction CUDA Programming and Performance	11	4384	February 7, 2012
Problem about time of copy data through shared memory CUDA Programming and Performance	4	352	March 30, 2024
[Beginner]: CUDA slower than serial implementation fill Operation on entire image CUDA Programming and Performance	18	13516	September 15, 2011

How to modify the Cuda output

Related topics