Actually, that kernel is the only one; there is nothing else. But I'll give more detail.
/*
 * Per-device worker: renders a fixed set of triangles into one image segment.
 *
 * pvoidData points to a DataStruct; this function reads its `deviceID`
 * (the CUDA device to use, also the index into the global elapsed_time_ms[])
 * and `IMAGE_SEG` (host output buffer of WIDTH*HEIGHTs*CHANNELS bytes).
 *
 * Steps: select the device, upload the 256-int MEM table, zero the device
 * image, launch PRINT_POLYGON once per triangle, time the launches with CUDA
 * events, then copy the device image back into IMAGE_SEG.
 *
 * Every CUDA runtime call is checked through HANDLE_ERROR, and each kernel
 * launch is followed by cudaGetLastError() so that launch-configuration
 * failures are reported instead of silently producing an empty image.
 *
 * Always returns 0.
 */
void* routine( void *pvoidData ) {
    DataStruct *data = (DataStruct*)pvoidData;
    unsigned char *IMAGE_SEG = data->IMAGE_SEG;
    unsigned char *dev_IMAGE = NULL;
    int *dev_MEM = NULL;

    // Sizes used by every allocation / copy below.
    const size_t imageBytes = sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS;
    const size_t memBytes   = sizeof(int)*256;

    // Vertex-index triples passed as (a, b, c) to PRINT_POLYGON, in launch order.
    static const int TRIANGLES[][3] = {
        {0, 1, 2},
        {0, 2, 3},
        {0, 3, 4},
        {0, 4, 5},
        {3, 2, 4},
        {2, 6, 4},
    };
    const int triangleCount = (int)(sizeof(TRIANGLES)/sizeof(TRIANGLES[0]));

    HANDLE_ERROR( cudaSetDevice(data->deviceID) );

    // Initialize the host output array.
    memset(IMAGE_SEG, 0, imageBytes);
    printf("Device %d Starting..\n", data->deviceID);

    // Events used to time the kernel launches.
    cudaEvent_t start, stop;
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    HANDLE_ERROR( cudaMalloc( (void **)&dev_MEM, memBytes ) );     // device copy of MEM
    HANDLE_ERROR( cudaMalloc( (void **)&dev_IMAGE, imageBytes ) ); // output array
    HANDLE_ERROR( cudaMemcpy( dev_MEM, MEM, memBytes, cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemset( dev_IMAGE, 0, imageBytes ) );

    // NOTE: integer division -- if WIDTH / HEIGHTs are not multiples of the
    // tile size, the remainder columns / rows are not covered by the grid.
    dim3 grid(WIDTH/TILE_WIDTH, HEIGHTs/TILE_HEIGHT); // blocks in a grid
    dim3 block(TILE_WIDTH, TILE_HEIGHT);              // threads in a block

    HANDLE_ERROR( cudaEventRecord(start, 0) );
    for (int t = 0; t < triangleCount; ++t) {
        PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID,
                                       TRIANGLES[t][0], TRIANGLES[t][1], TRIANGLES[t][2] );
        // Kernel launches return nothing; launch errors surface only here.
        HANDLE_ERROR( cudaGetLastError() );
    }
    HANDLE_ERROR( cudaEventRecord(stop, 0) );
    // Waiting on `stop` also surfaces any fault raised while the kernels ran.
    HANDLE_ERROR( cudaEventSynchronize(stop) );

    HANDLE_ERROR( cudaMemcpy( IMAGE_SEG, dev_IMAGE, imageBytes, cudaMemcpyDeviceToHost ) );
    HANDLE_ERROR( cudaFree( dev_MEM ) );
    HANDLE_ERROR( cudaFree( dev_IMAGE ) );

    // Calculate elapsed time between the two events.
    HANDLE_ERROR( cudaEventElapsedTime( &elapsed_time_ms[data->deviceID], start, stop ) );
    HANDLE_ERROR( cudaEventDestroy(start) );
    HANDLE_ERROR( cudaEventDestroy(stop) );

    printf("Algorithm Elapsed Time : %f ms(Device %d)\n", elapsed_time_ms[data->deviceID], data->deviceID);
    printf("Device %d Complete!\n", data->deviceID);
    return 0;
}
/*
 * Rasterizes one color-interpolated triangle into IMAGEin; one thread per pixel.
 *
 * Launch layout: 2D grid of 2D blocks; thread (x, y) handles pixel
 * (i, j) = (blockIdx.x*blockDim.x + threadIdx.x, blockIdx.y*blockDim.y + threadIdx.y).
 * No shared memory is used.
 *
 * IMAGEin : device image, WIDTH*HEIGHTs*CHANNELS bytes. Pixel (i, j) starts at
 *           byte (i*HEIGHTs + j)*CHANNELS, so threads adjacent in x are
 *           HEIGHTs*CHANNELS bytes apart (layout kept as-is; presumably the
 *           host consumer expects it -- confirm before changing).
 * MEMin   : device vertex table, 5 ints per vertex:
 *           [0] = x, [1] = y, [2..4] = three color components that are written
 *           to byte offsets 2, 1, 0 of the pixel respectively.
 * dev_ID  : not used by the kernel; kept for interface compatibility.
 * a, b, c : vertex indices (into MEMin) of the triangle.
 *
 * For a pixel inside the triangle the interpolated color is ADDED to the
 * existing pixel bytes (unsigned char arithmetic, so sums wrap). Pixels
 * outside the triangle, pixels outside the image, and degenerate (zero-area)
 * triangles are left untouched.
 */
__global__ void PRINT_POLYGON( unsigned char *IMAGEin, int *MEMin, int dev_ID, int a, int b, int c)
{
    (void)dev_ID;

    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    const int j = blockIdx.y*blockDim.y + threadIdx.y;

    // Guard against grids that overshoot the image.
    if (i >= WIDTH || j >= HEIGHTs) return;

    // Load each vertex record once instead of re-reading global memory per term.
    const int *va = MEMin + a*5;
    const int *vb = MEMin + b*5;
    const int *vc = MEMin + c*5;

    const int ax  = va[0];
    const int ay  = va[1];
    const int abx = vb[0] - ax;   // edge a -> b
    const int aby = vb[1] - ay;
    const int acx = vc[0] - ax;   // edge a -> c
    const int acy = vc[1] - ay;

    // Shared denominator of both weights; zero means the three vertices are
    // collinear, in which case nothing can be inside -- skip before dividing.
    const int denom = acy*abx - aby*acx;
    if (denom == 0) return;

    const int num_b = acy*(i - ax) - acx*(j - ay);   // numerator of the weight applied to (b - a)
    const int num_c = aby*(ax - i) - abx*(ay - j);   // numerator of the weight applied to (c - a)

    const float wb   = float(num_b) / float(denom);
    const float wc   = float(num_c) / float(denom);
    const float wsum = wb + wc;

    const bool inside = (0.0f <= wb   && wb   <= 1.0f)
                     && (0.0f <= wc   && wc   <= 1.0f)
                     && (0.0f <= wsum && wsum <= 1.0f);
    // Outside pixels would only have 0 added; skip the global read-modify-write.
    if (!inside) return;

    // size_t arithmetic so large images cannot overflow the byte offset.
    unsigned char *pixel = IMAGEin + ((size_t)i*HEIGHTs + (size_t)j)*CHANNELS;

    // Table components 2, 3, 4 go to pixel bytes 2, 1, 0.
    #pragma unroll
    for (int ch = 0; ch < 3; ++ch) {
        const float ca = float(va[2 + ch]);
        const float cb = float(vb[2 + ch]);
        const float cc = float(vc[2 + ch]);
        pixel[2 - ch] += (int)(ca + (cb - ca)*wb + (cc - ca)*wc);
    }
}