I want to change FuntionA into FuntionB which is implemented by CUDA. The d_in (input date) is a 2-D Array that its size is widthheight. And the d_out (ouput date) is also a 2-D Array that its size is 512height.
In the FuntionB, the loop as follow has a large calculational cost. Can it be changed into parallel loop which is implemented by CUDA?
for (int i=tid+bid*blockDim.x; i<size; i+=blockDim.x*gridDim.x)
{
if ( i<16 || i>(size-16))
{
tmpv = d_in[i];
}
else
{
tmpv = 0;
for(int k=0; k<16; k++)
{
tmpv += (d_in[i+k] + d_in[i-(k+1)])* tempLowpass[k];
__syncthreads();
}
}
}
FuntionA(int *d_in, char *d_out, int width, int height, char *lowpass, int gene )
{
int *tmp_out1;
int *tmp_out2;
char *tmp_out3;
tmp_out1 = (int *)malloc((width+1024)*sizeof(int));
tmp_out2 = (int *)malloc((width+1024)*sizeof(int));
tmp_out3 = (char *)malloc((width+1024)*sizeof(char));
int tmpv = 0;
for(int i=0; i<height; i++)
{
for(int j=0; j<width; j++)
{
tmpv = 0;
tmp_out1[j] = (int)abs(d_in[i*width+j]);
if(j<16 || j>(width-16))
{
tmp_out2[j] = tmp_out1[j];
}
else
{
for(int k=0; k<16; k++)
{
tmpv += (tmp_out1[j+k] + tmp_out1[j-(k+1)]) * lowpass[k];
}
tmpv = tmpv/128;
tmp_out2[j] = tmpv;
}
if(GetLog(tmp_out2[j]) > 255)
tmp_out3[j] = char(255);
else
tmp_out3[j] = char(GetLog(tmp_out2[j]));
}
for(int j=0; j<512; j++)
{
d_out[512*i+j] = tmp_out3[j*gene];
}
}
free(tmp_out1);
free(tmp_out2);
free(tmp_out3);
}
//--------------------------------------------------------------------------------------------------------------------------//
global void FuntionB(int *d_in, int *d_Temp, char *d_out, int width, int height, char *lowpass, int gene)
{
shared char tempLowpass[16];
for (int k=0; k<16; k++)
{
tempLowpass[k] = lowpass[k];
__syncthreads();
}
int tid = threadIdx.x;
int bid = blockIdx.x;
int size = width * height;
float tmpv;
for (int i=tid+bid*blockDim.x; i<size; i+=blockDim.x*gridDim.x)
{
d_in[i] = (int)abs(d_in[i]);
__syncthreads();
}
for (int i=tid+bid*blockDim.x; i<size; i+=blockDim.x*gridDim.x)
{
if ( i<16 || i>(size-16))
{
tmpv = d_in[i];
}
else
{
tmpv = 0;
for(int k=0; k<16; k++)
{
tmpv += (d_in[i+k] + d_in[i-(k+1)])* tempLowpass[k];
__syncthreads();
}
tmpv = tmpv/128;
}
if(GetLog(tmpv) > 255)
d_Temp[i] = char(255);
else
d_Temp[i] = char(GetLog(tmpv));
__syncthreads();
}
for (int i=tid+bid*blockDim.x; i<512*height; i+=blockDim.x*gridDim.x)
{
d_out[i] = d_Temp[i*30];
__syncthreads();
}
}