Merry Xmas! Can someone help me improve my code?

I want to improve its speed.

I ran it on an i7 CPU and it takes 20 ms.

Then I ran it on a GeForce GTX 570 and it takes 5 ms.

So I think it can still be improved.

Thank you guys!!

#include <atltime.h>

#include "cuda_runtime.h"

#include <stdio.h>

#include <stdlib.h>

#include <iostream>

#include <iomanip>

#include <time.h>

//

#define ROW 3000

#define COL 2000

#define DATA_SIZE ROW*COL

#define THREAD_NUM 256

int* data;

int* result;

// Fills number[0 .. size-1] with pseudo-random digits in the range 0..9.
// The C library generator is re-seeded from the wall clock on every call.
// A size of zero or less leaves the buffer untouched.
void GenerateNumbers(int *number, int size)
{
	srand((unsigned) time(NULL));

	int idx = 0;
	while (idx < size)
	{
		number[idx] = rand() % 10;
		++idx;
	}
}

/*
 * CheckInGPU5 - classifies every element of a pitched 2D int matrix.
 *
 * Launch layout:
 *   - one thread block per row (blockIdx.x is the row index; blocks whose
 *     index is >= _row do nothing);
 *   - any 1D block size: the threads of a block stride over the columns;
 *   - dynamic shared memory of at least _col * sizeof(int) bytes, used to
 *     cache the block's row.
 *
 * Parameters:
 *   _row, _col   logical matrix dimensions in elements.
 *   _data        input matrix (device memory).
 *   _result      output matrix (device memory).
 *   _duration    number of backward steps to examine per element.
 *   _pitch_col   row stride in ELEMENTS (not bytes). The same stride is used
 *                for both _data and _result, so both allocations must share
 *                the same pitch.
 *
 * Value written to _result[row][c]:
 *    0  if c < _duration, or if the very first backward step runs off the
 *       start of the row while skipping zeros;
 *   -1  if the element itself is 0;
 *   -2  if, at some step, the current non-zero value is smaller than the
 *       nearest non-zero value to its left;
 *    1  otherwise (all steps passed, or the start of the row was reached
 *       after at least one completed step).
 */
__global__ static void CheckInGPU5(size_t _row, size_t _col, int* _data, int* _result, size_t _duration, size_t _pitch_col)
{
	// The cache is typed int to match the data. Staging ints through a
	// float array costs a conversion on every access and is only exact for
	// magnitudes up to 2^24.
	extern __shared__ int rowCache[];

	// blockIdx.x is uniform across the block, so either every thread of the
	// block returns here or none does; the barrier below is therefore safe.
	const size_t row = blockIdx.x;
	if (row >= _row)
		return;

	// size_t keeps the flat index from being narrowed to int.
	const size_t rowBase = row * _pitch_col;

	// Cooperative load of the row into shared memory.
	for (size_t i = threadIdx.x; i < _col; i += blockDim.x)
	{
		rowCache[i] = _data[rowBase + i];
	}

	__syncthreads();

	for (size_t c = threadIdx.x; c < _col; c += blockDim.x)
	{
		int verdict;

		if (c < _duration)
		{
			// Not enough elements to the left to examine.
			verdict = 0;
		}
		else if (rowCache[c] == 0)
		{
			verdict = -1;
		}
		else
		{
			verdict = 1;      // default when no step decides otherwise
			size_t pos = c;   // index of the current non-zero element

			// Once pos reaches 0 there is nothing further to the left, so the
			// remaining steps cannot change the verdict.
			for (size_t step = 0; step < _duration && pos > 0; ++step)
			{
				const int current = rowCache[pos];

				// Find the nearest non-zero element to the left of pos.
				size_t prev = pos - 1;
				while (rowCache[prev] == 0 && prev > 0)
				{
					--prev;
				}

				if (rowCache[prev] == 0)
				{
					// Only zeros remain down to the start of the row.
					verdict = (step == 0) ? 0 : 1;
					break;
				}

				if (current < rowCache[prev])
				{
					verdict = -2;
					break;
				}

				pos = prev;
			}
		}

		_result[rowBase + c] = verdict;
	}
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool CudaOk(cudaError_t status, const char* what)
{
	if (status == cudaSuccess)
		return true;

	fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
	return false;
}

// Host driver: generates a ROW x COL matrix of random digits, uploads it to
// pitched device memory, runs CheckInGPU5 with one block per row, copies the
// result back and prints the elapsed time of kernel + copy-back.
// Returns 0 on success and 1 if a host allocation or any CUDA call fails.
int main()
{
	const size_t rowBytes = sizeof(int) * COL;
	const size_t hostBytes = rowBytes * ROW;

	data = (int*)malloc(hostBytes);
	result = (int*)malloc(hostBytes);
	int* resultGPU = (int*)malloc(hostBytes);

	if (data == NULL || result == NULL || resultGPU == NULL)
	{
		fprintf(stderr, "host allocation failed\n");
		free(data);
		data = NULL;
		free(result);
		result = NULL;
		free(resultGPU);
		return 1;
	}

	GenerateNumbers(data, DATA_SIZE);

	LARGE_INTEGER litmp;
	LONGLONG Qpart1 = 0, Qpart2 = 0;
	QueryPerformanceFrequency(&litmp);
	const double dfFreq = (double)litmp.QuadPart;

	// Initialised to NULL so the unconditional cudaFree calls below are safe
	// even when an allocation fails.
	int* gpudata = NULL;
	int* resultDev = NULL;
	size_t pitch_data = 0, pitch_result = 0;

	///////////////pitch/////////////
	// cudaMemset2D fills BYTES, so every int in both buffers starts as 0x09090909.
	bool ok = CudaOk(cudaMallocPitch((void**) &gpudata, &pitch_data, rowBytes, ROW), "cudaMallocPitch(data)");
	ok = ok && CudaOk(cudaMemset2D(gpudata, pitch_data, 9, rowBytes, ROW), "cudaMemset2D(data)");
	ok = ok && CudaOk(cudaMallocPitch((void**) &resultDev, &pitch_result, rowBytes, ROW), "cudaMallocPitch(result)");
	ok = ok && CudaOk(cudaMemset2D(resultDev, pitch_result, 9, rowBytes, ROW), "cudaMemset2D(result)");
	ok = ok && CudaOk(cudaMemcpy2D(gpudata, pitch_data, data, rowBytes, rowBytes, ROW, cudaMemcpyHostToDevice), "cudaMemcpy2D(host to device)");

	// The kernel takes a single row stride and applies it to both buffers.
	if (ok && pitch_data != pitch_result)
	{
		fprintf(stderr, "input and result pitches differ (%lu vs %lu bytes)\n",
			(unsigned long)pitch_data, (unsigned long)pitch_result);
		ok = false;
	}

	if (ok)
	{
		QueryPerformanceCounter(&litmp);
		Qpart1 = litmp.QuadPart;

		CheckInGPU5<<<ROW, THREAD_NUM, sizeof(int) * COL>>>(ROW,COL,gpudata,resultDev,3,pitch_data/sizeof(int));

		// A kernel launch returns no status; launch errors must be polled.
		ok = CudaOk(cudaGetLastError(), "CheckInGPU5 launch");

		// This copy blocks until the kernel has finished, so the timed
		// interval covers kernel execution plus the copy back to the host.
		ok = ok && CudaOk(cudaMemcpy2D(resultGPU, rowBytes, resultDev, pitch_result, rowBytes, ROW, cudaMemcpyDeviceToHost), "cudaMemcpy2D(device to host)");

		QueryPerformanceCounter(&litmp);
		Qpart2 = litmp.QuadPart;
	}
	//////////////pitch///////////////

	cudaFree(gpudata);
	cudaFree(resultDev);

	if (ok)
		printf("GPU Result(pass:%.2fms):\n",((double)(Qpart2 - Qpart1)/dfFreq)*1000);

	// These buffers came from malloc, so they must be released with free;
	// releasing them with delete is undefined behaviour.
	free(data);
	data=NULL;
	free(result);
	result=NULL;
	free(resultGPU);
	resultGPU=NULL;

	system("pause");

	return ok ? 0 : 1;
}

Which type of improvement do you want?

If you want any type of editing, then PM me.

But I can't write new code; I can only edit existing code,
because I haven't formally learned it; I only know what I've picked up from some sites.

So PM me what you want and I'll try to help you :)

Thank you kind guy, I will pm you later!

Hello,

First you need to find out where the bottleneck is. If you run the Visual Profiler you can get good information.

Thank you! I am learning how to use profiler now.

Happy new year! But I can't help if you provide very little documentation about your code!
btw,
The performance counters exist between your ears.

Happy new year! But I can't help if you provide very little documentation about your code!
btw,
The performance counters exist between your ears.