I want to improve this code's speed.
I ran it on an i7 CPU and it took 20 ms,
then ran it on a GeForce GTX 570 and it took 5 ms,
so I think we can still improve it.
Thank you, guys!
#include <atltime.h>
#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <time.h>
// Problem size and kernel launch configuration.
#define ROW 3000
#define COL 2000
#define DATA_SIZE ROW*COL   // total element count; unparenthesized — safe here, but beware inside larger expressions
#define THREAD_NUM 256      // threads per block (one block per row is launched in main)
int* data;    // host input matrix, ROW*COL ints; allocated/filled/freed in main
int* result;  // host buffer, ROW*COL ints; allocated and freed in main, not otherwise used in this chunk
// Fill `number` with `size` pseudo-random integers in the range [0, 9].
//
// Fix: the original called srand() on every invocation, so two calls within
// the same wall-clock second produced identical sequences. Seed the C RNG
// only once per process instead.
void GenerateNumbers(int *number, int size)
{
    static bool seeded = false;
    if (!seeded) {
        srand((unsigned) time(NULL));
        seeded = true;
    }
    for (int i = 0; i < size; i++) {
        number[i] = rand() % 10;
    }
}
// Per-row backward scan over a pitched _row x _col int matrix.
//
// Expected launch layout (see the caller): grid.x = _row (one block per row),
// blockDim.x threads per block, dynamic shared memory = _col * sizeof(int).
// Each thread handles columns tid, tid + blockDim.x, ... of its block's row.
// _pitch_col is the allocation pitch expressed in ints; it is used to index
// both _data and _result, so both must share the same pitch.
//
// For each cell _c of the row, writes a status code into _result:
//    0 : _c < _duration, or the backward walk reached the row start before
//        finding any previous non-zero value
//   -1 : the cell's own value is zero
//   -2 : some previous non-zero value (within _duration steps) is greater
//        than the current value
//    1 : otherwise (walk completed, or hit the row start after >= 1 step)
__global__ static void CheckInGPU5(size_t _row, size_t _col, int* _data, int* _result, size_t _duration, size_t _pitch_col)
{
    const int tid = threadIdx.x;
    const int row = blockIdx.x;
    if (row < _row)
    {
        // Cache this block's row in shared memory.
        // FIX: this array was declared `float` while it stores ints and is
        // sized by the caller as sizeof(int) * _col; the data only survived
        // the int -> float -> int round trip because the values are < 2^24.
        extern __shared__ int shared1[];
        for (int i = tid; i < _col; i += blockDim.x) {
            shared1[i] = _data[row * _pitch_col + i];
        }
        // `row` is uniform within a block, so every thread reaches this
        // barrier (no divergent __syncthreads()).
        __syncthreads();

        int _r = row;
        int _close1 = 0;
        int _close2 = 0;
        int _index1 = 0;
        int _index2 = 0;
        int _resultIndex = 0;
        int _count2 = 0;
        bool _bRowStart = false;
        bool _bHaveSetValue = false;
        for (int _c = tid; _c < _col; _c += blockDim.x)
        {
            _index1 = _c;
            _index2 = _c;
            _resultIndex = _r * _pitch_col + _c;
            _bHaveSetValue = false;
            _bRowStart = false;
            _count2 = 0;
            // The first _duration columns have no full look-back window.
            if (_c < _duration)
            {
                _result[_resultIndex] = 0;
                _bHaveSetValue = true;
                continue;
            }
            _close2 = shared1[_index2];
            if (_close2 == 0)
            {
                _result[_resultIndex] = -1;
                _bHaveSetValue = true;
                continue;
            }
            // Walk backwards over up to _duration previous non-zero cells.
            for (int _d = 0; _d < _duration; _d++)
            {
                if (_index1 > 0)
                {
                    _close2 = shared1[_index2];
                    _index1 = _index2 - 1;
                    _close1 = shared1[_index1];
                    // Skip zero cells; stop at the first non-zero value or
                    // at the start of the row.
                    while (_close1 == 0)
                    {
                        if (_index1 > 0)
                        {
                            _index1--;
                            _close1 = shared1[_index1];
                            if (_close1 > 0)
                                break;
                        }
                        else
                        {
                            _bRowStart = true;
                            break;
                        }
                    }
                    if (_bRowStart == true && _count2 == 0)
                    {
                        _result[_resultIndex] = 0;
                        _bHaveSetValue = true;
                        break;
                    }
                    if (_bRowStart == true)
                    {
                        _result[_resultIndex] = 1;
                        _bHaveSetValue = true;
                        break;
                    }
                    _count2++;
                    _close1 = shared1[_index1];
                    if (_close2 < _close1)
                    {
                        _result[_resultIndex] = -2;
                        _bHaveSetValue = true;
                        break;
                    }
                    _index2 = _index1;
                }
            }
            // Walk completed without triggering any of the codes above.
            if (_bHaveSetValue == false)
                _result[_resultIndex] = 1;
        }
    }
}
// Benchmark driver: fill a ROW x COL int matrix with random digits, run
// CheckInGPU5 on the GPU, copy the result back, and report the elapsed time
// (kernel + device-to-host copy) using the Windows high-resolution timer.
int main()
{
    // Host buffers. FIX: the original released malloc'd memory with
    // `delete`, which is undefined behavior — use free() instead.
    data = (int*)malloc(sizeof(int) * ROW * COL);
    result = (int*)malloc(sizeof(int) * ROW * COL);
    int* resultGPU = (int*)malloc(sizeof(int) * ROW * COL);
    if (data == NULL || result == NULL || resultGPU == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    GenerateNumbers(data, DATA_SIZE);

    LARGE_INTEGER litmp;
    LONGLONG Qpart1, Qpart2;
    QueryPerformanceFrequency(&litmp);
    double dfFreq = (double)litmp.QuadPart;  // counter ticks per second

    // Pitched device allocations keep each row aligned for coalesced access.
    // Note: cudaMemset2D is byte-wise (each int becomes 0x09090909); both
    // fills are fully overwritten before use, so they only mark the memory.
    int *gpudata, *resultDev;
    size_t pitch_data, pitch_result;
    cudaMallocPitch((void**)&gpudata, &pitch_data, sizeof(int) * COL, ROW);
    cudaMemset2D(gpudata, pitch_data, 9, COL * sizeof(int), ROW);
    cudaMallocPitch((void**)&resultDev, &pitch_result, sizeof(int) * COL, ROW);
    cudaMemset2D(resultDev, pitch_result, 9, COL * sizeof(int), ROW);

    // NOTE(review): the kernel indexes _result with _data's pitch. The two
    // allocations have identical width/height, so the pitches should match,
    // but warn rather than silently assume it.
    if (pitch_data != pitch_result)
        fprintf(stderr, "warning: data/result pitches differ (%lu vs %lu)\n",
                (unsigned long)pitch_data, (unsigned long)pitch_result);

    cudaMemcpy2D(gpudata, pitch_data, data, sizeof(int) * COL,
                 sizeof(int) * COL, ROW, cudaMemcpyHostToDevice);

    QueryPerformanceCounter(&litmp);
    Qpart1 = litmp.QuadPart;

    // One block per row; dynamic shared memory caches one row of ints.
    CheckInGPU5<<<ROW, THREAD_NUM, sizeof(int) * COL>>>(ROW, COL, gpudata, resultDev, 3, pitch_data / sizeof(int));
    cudaError_t err = cudaGetLastError();  // catches launch-configuration errors
    if (err != cudaSuccess)
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));

    // This blocking copy also synchronizes, so the timing below covers
    // kernel execution plus the device-to-host transfer.
    cudaMemcpy2D(resultGPU, sizeof(int) * COL, resultDev, pitch_result,
                 sizeof(int) * COL, ROW, cudaMemcpyDeviceToHost);

    QueryPerformanceCounter(&litmp);
    Qpart2 = litmp.QuadPart;

    cudaFree(gpudata);
    cudaFree(resultDev);

    printf("GPU Result(pass:%.2fms):\n", ((double)(Qpart2 - Qpart1) / dfFreq) * 1000);

    free(data);
    data = NULL;
    free(result);
    result = NULL;
    free(resultGPU);
    resultGPU = NULL;
    system("pause");
    return 0;
}