Hello, I am a student from China. I have just started learning OpenACC and I am new to the concept of “routine”.
Here is the problem I have encountered: when I use a parallel loop gang to call a routine worker, the loop inside the routine seems to do nothing. I have looked through a lot of material (books and the Internet) and changed my code again and again to resolve this problem, but without success. Here is my code:
- First, here is a version of the code that works correctly:
(The following definitions come from other files:)
/* Status value that TOA_conv3D returns when it runs to completion. */
#define SUCCEED 0
/* Element type of the input, filter-weight and output arrays. */
#define DATATYPE float
/*
 * Extents of a three-dimensional array.
 * Arrays described by this struct are indexed [depth][height][width].
 */
typedef struct _Para_3D
{
    CPTYPE WIDTH;  /* extent of the last (innermost) index */
    CPTYPE HEIGHT; /* extent of the middle index */
    CPTYPE DEPTH;  /* extent of the first (outermost) index */
} Para_3D;
/*
 * Width/height pair; presumably the two-dimensional counterpart of Para_3D
 * (not used in the code shown here -- confirm against its callers).
 */
typedef struct _Para_2D
{
    CPTYPE WIDTH;
    CPTYPE HEIGHT;
} Para_2D;
I have this code in TestOpenACC.h:
/*
 * State of the test convolution layer: the shape parameters followed by the
 * host pointers to the data arrays that are mirrored on the device.
 */
typedef struct _testconvlayer
{
    /* ---- Parameters ---- */
    Para_3D tv_inputPara;    /* input image parameters */
    Para_3D tv_filterPara;   /* filter parameters */
    Para_3D tv_outputPara;   /* output image parameters */
    CPTYPE tv_filter_number; /* number of filters */
    CPTYPE tv_stride;        /* stride */

    /* ---- Data ---- */
    /* Filter weights, one 3-D weight array per filter:
     * [filter][depth][height][width]. Released together at final teardown. */
    DATATYPE ****restrict filter_weights_ACC;
    /* Input image array: [depth][height][width]. */
    DATATYPE ***restrict tv_input_array;
    /* Output 3-D array: [depth][height][width]. Released together at final teardown. */
    DATATYPE ***restrict tv_output_array_ACC;
} TestConvLayer;
/**
 * Device routine that processes one depth plane of the output array.
 *
 * The directive below must match the one on the definition so that every
 * translation unit including this header compiles calls to TOA_conv3D as
 * calls to a worker-level device routine.
 *
 * In the definition shown in TestOpenACC.c only _output_array, _output3DPara
 * and _depth_index are used: every element of plane *_depth_index is set to
 * 999. The remaining parameters are accepted but not read there.
 *
 * @param _input_array   input array, [depth][height][width]
 * @param _input3DPara   extents of _input_array
 * @param _filter_array  filter weights, [filter][depth][height][width]
 * @param _filter3DPara  extents of one filter
 * @param _output_array  output array, [depth][height][width]; written
 * @param _output3DPara  extents of _output_array (HEIGHT and WIDTH bound the loops)
 * @param _depth_index   index of the output plane to write
 * @param _stride        stride
 * @return SUCCEED
 */
#pragma acc routine worker
RESULTTYPE TOA_conv3D(
    DATATYPE const ***const _input_array, Para_3D const *const _input3DPara,
    DATATYPE const ****const _filter_array, Para_3D const *const _filter3DPara,
    DATATYPE ***const _output_array, Para_3D const *const _output3DPara, CPTYPE const *const _depth_index,
    CPTYPE const *const _stride);

/**
 * Builds a small test layer on the host, mirrors it on the device, calls
 * TOA_conv3D once per filter inside a gang-parallel loop and prints the arrays.
 * Takes no arguments ("(void)" makes this a real prototype in C).
 */
void TOA_testConv(void);
And this code in TestOpenACC.c:
/*
 * Worker-level OpenACC routine that fills one depth plane of the output
 * array with the placeholder value 999.
 *
 * Only _output_array, _output3DPara and _depth_index are used; the input,
 * filter and stride parameters are accepted but not read.
 *
 * Returns SUCCEED.
 */
#pragma acc routine worker
RESULTTYPE TOA_conv3D(
    DATATYPE const ***const _input_array, Para_3D const *const _input3DPara,
    DATATYPE const ****const _filter_array, Para_3D const *const _filter3DPara,
    DATATYPE ***const _output_array, Para_3D const *const _output3DPara, CPTYPE const *const _depth_index,
    CPTYPE const *const _stride)
{
    /* Read the loop bounds once so both loops have plain scalar limits. */
    CPTYPE const row_count = _output3DPara->HEIGHT;
    CPTYPE const col_count = _output3DPara->WIDTH;

    /* The two loops stay tightly nested, as collapse(2) requires. */
    #pragma acc loop collapse(2) independent
    for (CPTYPE row = 0; row < row_count; ++row)
    {
        for (CPTYPE col = 0; col < col_count; ++col)
        {
            /* write into the output array */
            _output_array[*_depth_index][row][col] = 999;
        }
    }

    return SUCCEED;
}
/*
 * Test driver for the gang-loop -> worker-routine call.
 *
 * Steps:
 *   1. build a 5x5x3 input, two 3x3x3 filters and a 3x3x2 output on the host;
 *   2. mirror the layer struct and its arrays on the device;
 *   3. print the host arrays before and after a device->host update of the
 *      output array;
 *   4. call TOA_conv3D once per filter from a gang-parallel loop;
 *   5. copy the output array back to the host and print it;
 *   6. remove the device copies and free what this function allocated.
 *
 * Prints a message to stderr and returns early if a host allocation fails.
 */
void TOA_testConv(void)
{
    TestConvLayer *testConvLayer = malloc(sizeof *testConvLayer);
    if (testConvLayer == NULL)
    {
        fprintf(stderr, "TOA_testConv: failed to allocate the test layer\n");
        return;
    }

    { // Init
        DATATYPE low = -10;
        DATATYPE high = -1.0f * low;
        DATATYPE one = 1;

        testConvLayer->tv_stride = 1;
        { // Parameters of the input
            testConvLayer->tv_inputPara.DEPTH = 3;
            testConvLayer->tv_inputPara.HEIGHT = 5;
            testConvLayer->tv_inputPara.WIDTH = 5;
        }
        { // Parameters of the filters
            testConvLayer->tv_filterPara.DEPTH = 3;
            testConvLayer->tv_filterPara.HEIGHT = 3;
            testConvLayer->tv_filterPara.WIDTH = 3;
            testConvLayer->tv_filter_number = 2;
        }
        { // Parameters of the output
            testConvLayer->tv_outputPara.DEPTH = 2;
            testConvLayer->tv_outputPara.HEIGHT = 3;
            testConvLayer->tv_outputPara.WIDTH = 3;
        }

        // Allocate the per-filter pointer table before any array is built, so
        // that a failure here can be cleaned up completely.
        testConvLayer->filter_weights_ACC =
            malloc(sizeof(*testConvLayer->filter_weights_ACC) * testConvLayer->tv_filter_number);
        if (testConvLayer->filter_weights_ACC == NULL)
        {
            fprintf(stderr, "TOA_testConv: failed to allocate the filter table\n");
            free(testConvLayer);
            return;
        }

        testConvLayer->tv_input_array = CPU_T_random_uniform3D(&low, &high, &testConvLayer->tv_inputPara);
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            testConvLayer->filter_weights_ACC[filter_index] = CPU_T_random_uniform3D(&low, &high, &testConvLayer->tv_filterPara);
        }
        testConvLayer->tv_output_array_ACC = CPU_T_array3D(&testConvLayer->tv_outputPara, &one);

        // Mirror the struct first, then the arrays its members point to.
        #pragma acc enter data copyin(testConvLayer[0:1])
        #pragma acc enter data copyin(testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH])
        #pragma acc enter data copyin(testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH])
        #pragma acc enter data create(testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])
    }

    { // Show array data: host values right after initialisation
        printf("--------------------show initial information--------------------\n");
        printf("Input Array\n");
        CPU_T_show3Darray(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara);
        printf("Weight Array\n");
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            printf("Weight Array[%d]\n", filter_index + 1);
            CPU_T_show3Darray(testConvLayer->filter_weights_ACC[filter_index], &testConvLayer->tv_filterPara);
        }
        printf("Output Array(before update)\n");
        CPU_T_show3Darray(testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara);
    }

    // NOTE: the device copy of the output array was only `create`d, so this
    // update overwrites the host values with uninitialised device memory.
    #pragma acc update host(testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])

    { // Show array data: host values after the first device->host update
        printf("--------------------show initial information--------------------\n");
        printf("Input Array\n");
        CPU_T_show3Darray(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara);
        printf("Weight Array\n");
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            printf("Weight Array[%d]\n", filter_index + 1);
            CPU_T_show3Darray(testConvLayer->filter_weights_ACC[filter_index], &testConvLayer->tv_filterPara);
        }
        printf("Output Array(after update)\n");
        CPU_T_show3Darray(testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara);
    }

    // One gang iteration per filter; each iteration calls the worker routine,
    // which writes output plane `filter_index`.
    #pragma acc parallel loop gang present(testConvLayer[0:1], \
        testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH], \
        testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH], \
        testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])\
        num_workers(1)
    for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
    {
        TOA_conv3D(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara,
                   testConvLayer->filter_weights_ACC, &testConvLayer->tv_filterPara,
                   testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara, &filter_index,
                   &testConvLayer->tv_stride);
    }

    // The routine wrote to the DEVICE copy of the output array. Copy it back
    // before printing; otherwise the host still holds the old values and the
    // routine appears to have done nothing.
    #pragma acc update host(testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])

    { // Show array data: host values after the compute region
        printf("--------------------show initial information--------------------\n");
        printf("Input Array\n");
        CPU_T_show3Darray(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara);
        printf("Weight Array\n");
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            printf("Weight Array[%d]\n", filter_index + 1);
            CPU_T_show3Darray(testConvLayer->filter_weights_ACC[filter_index], &testConvLayer->tv_filterPara);
        }
        printf("Output Array\n");
        CPU_T_show3Darray(testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara);
    }

    { // Delete the device copies: arrays first, the enclosing struct last
        #pragma acc exit data delete(testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH])
        #pragma acc exit data delete(testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH])
        #pragma acc exit data delete(testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])
        #pragma acc exit data delete(testConvLayer[0:1])
    }

    // Release the host memory allocated directly in this function.
    // TODO(review): the 3-D arrays returned by CPU_T_random_uniform3D and
    // CPU_T_array3D are not released here because their matching deallocator
    // is not visible in this file section.
    free(testConvLayer->filter_weights_ACC);
    free(testConvLayer);
}