Puzzling problem: calling a routine from a parallel region

Hello, I am a student from China. I have just started learning OpenACC and am new to the concept of "routine".

Here I have encountered a problem: when I use a parallel loop gang to call a worker routine, the loop in the routine seems to do nothing. I have looked through a lot of information (from books and the Internet) and changed my code again and again to resolve this problem, but failed. Here is my code:

  1. First, here is a correct version of the code:
    (This is some code from other files:)
#define SUCCEED 0

#define DATATYPE float

/* Dimensions of a 3-D array (used for input images, filters and outputs). */
typedef struct _Para_3D
{
    CPTYPE WIDTH;   // extent along X
    CPTYPE HEIGHT;  // extent along Y
    CPTYPE DEPTH;   // extent along Z (number of 2-D slices)
} Para_3D;

/* Dimensions of a 2-D array. */
typedef struct _Para_2D
{
    CPTYPE WIDTH;   // extent along X
    CPTYPE HEIGHT;  // extent along Y
} Para_2D;

I have these code in TestOpenACC.h

/* One test convolution layer: shape parameters plus host-side data arrays.
   (Original comments were mojibake-damaged Chinese; translated to English.) */
typedef struct _testconvlayer
{
    //Parameter
    Para_3D tv_inputPara;//input image dimensions
    Para_3D tv_filterPara;//filter dimensions
    Para_3D tv_outputPara;//output image dimensions
    CPTYPE tv_filter_number;//number of filters
    CPTYPE tv_stride;//convolution stride

    //Data
    DATATYPE ****restrict filter_weights_ACC;//4-D filter weight array; freed in one place at final teardown
    DATATYPE ***restrict tv_input_array;//input image array
    DATATYPE ***restrict tv_output_array_ACC;//output 3-D array; freed in one place at final teardown
} TestConvLayer;

/* Writes one depth slice (*_depth_index) of _output_array on the device.
   Declared "#pragma acc routine worker" at its definition, so it may be
   called from a gang-level loop in an OpenACC parallel region.
   Returns SUCCEED on completion. */
RESULTTYPE TOA_conv3D(
            DATATYPE const ***const _input_array, Para_3D const *const _input3DPara,
            DATATYPE const ****const _filter_array, Para_3D const *const _filter3DPara,
            DATATYPE ***const _output_array, Para_3D const *const _output3DPara, CPTYPE const *const _depth_index,
            CPTYPE const *const _stride);

/* Host-side driver that builds a small test layer and runs TOA_conv3D. */
void TOA_testConv();

And these code in TestOpenACC.c

/*
 * Device routine: fills depth slice *_depth_index of _output_array with the
 * constant 999 (a placeholder for the real convolution).  Declared as an
 * OpenACC "routine worker" so it can be called from a gang-level loop in a
 * parallel region; the loop below is then distributed across the workers of
 * the calling gang.
 *
 * FIX: the loop directive now says "worker" explicitly.  Per the compiler
 * feedback quoted later in this thread, the compiler will not choose a
 * worker schedule unless the user requests it, so without this keyword the
 * collapsed loop was compiled as "seq".
 */
#pragma acc routine worker
RESULTTYPE TOA_conv3D(
            DATATYPE const ***const _input_array, Para_3D const *const _input3DPara,
            DATATYPE const ****const _filter_array, Para_3D const *const _filter3DPara,
            DATATYPE ***const _output_array, Para_3D const *const _output3DPara, CPTYPE const *const _depth_index,
            CPTYPE const *const _stride)
{
    /* Distribute the 2-D output plane over the workers of the calling gang. */
#pragma acc loop worker collapse(2) independent
    for (CPTYPE output_index_Y = 0; output_index_Y < _output3DPara->HEIGHT; output_index_Y++)
    {
        for (CPTYPE output_index_X = 0; output_index_X < _output3DPara->WIDTH; output_index_X++)
        {
            _output_array[*_depth_index][output_index_Y][output_index_X] = 999;//write marker value into the output array
        }
    }
    return SUCCEED;
}

void TOA_testConv()
{
    TestConvLayer *testConvLayer = malloc(sizeof(TestConvLayer));

    {//Init
        DATATYPE low = -10, high = -1.0f * low, zero = 0, one = 1, two = 2;

        testConvLayer->tv_stride = 1;
        {//Parameter of input
            testConvLayer->tv_inputPara.DEPTH = 3;
            testConvLayer->tv_inputPara.HEIGHT = 5;
            testConvLayer->tv_inputPara.WIDTH = 5;
        }
        {//Parameter of input of filter
            testConvLayer->tv_filterPara.DEPTH = 3;
            testConvLayer->tv_filterPara.HEIGHT = 3;
            testConvLayer->tv_filterPara.WIDTH = 3;
            testConvLayer->tv_filter_number = 2;
        }
        {//Parameter of output
            testConvLayer->tv_outputPara.DEPTH = 2;
            testConvLayer->tv_outputPara.HEIGHT = 3;
            testConvLayer->tv_outputPara.WIDTH = 3;
        }
        testConvLayer->tv_input_array = CPU_T_random_uniform3D(&low, &high, &testConvLayer->tv_inputPara);
        testConvLayer->filter_weights_ACC = malloc(sizeof(DATATYPE ***) * testConvLayer->tv_filter_number);
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            testConvLayer->filter_weights_ACC[filter_index] = CPU_T_random_uniform3D(&low, &high, &testConvLayer->tv_filterPara);
        }
        testConvLayer->tv_output_array_ACC = CPU_T_array3D(&testConvLayer->tv_outputPara, &one);
#pragma acc enter data copyin(testConvLayer[0:1])
#pragma acc enter data copyin(testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH])
#pragma acc enter data copyin(testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH])
#pragma acc enter data create(testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])
    }

    {//show array data
        printf("--------------------show initial information--------------------\n");
        printf("Input Array\n");
        CPU_T_show3Darray(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara);
        printf("Weight Array\n");
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            printf("Weight Array[%d]\n", filter_index + 1);
            CPU_T_show3Darray(testConvLayer->filter_weights_ACC[filter_index], &testConvLayer->tv_filterPara);
        }
        printf("Output Array(before update)\n");
        CPU_T_show3Darray(testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara);
    }

#pragma acc update host(testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])
    {//show array data
        printf("--------------------show initial information--------------------\n");
        printf("Input Array\n");
        CPU_T_show3Darray(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara);
        printf("Weight Array\n");
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            printf("Weight Array[%d]\n", filter_index + 1);
            CPU_T_show3Darray(testConvLayer->filter_weights_ACC[filter_index], &testConvLayer->tv_filterPara);
        }
        printf("Output Array(after update)\n");
        CPU_T_show3Darray(testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara);
    }

#pragma acc parallel loop gang present(testConvLayer[0:1], \
    testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH], \
    testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH], \
    testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])\
    num_workers(1)
    for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
    {
        TOA_conv3D(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara,
                   testConvLayer->filter_weights_ACC, &testConvLayer->tv_filterPara,
                   testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara, &filter_index,
                   &testConvLayer->tv_stride);
    }

    {//show array data
        printf("--------------------show initial information--------------------\n");
        printf("Input Array\n");
        CPU_T_show3Darray(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara);
        printf("Weight Array\n");
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            printf("Weight Array[%d]\n", filter_index + 1);
            CPU_T_show3Darray(testConvLayer->filter_weights_ACC[filter_index], &testConvLayer->tv_filterPara);
        }
        printf("Output Array\n");
        CPU_T_show3Darray(testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara);
    }

    {//delete data in gpu
#pragma acc exit data delete(testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH])
#pragma acc exit data delete(testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH])
#pragma acc exit data delete(testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])
#pragma acc exit data delete(testConvLayer[0:1])
    }
}

And the output is:

--------------------show initial information--------------------
Input Array
 -2.874  -6.777   4.342   4.207  -1.622 
  9.291  -7.907   7.580  -2.390  -9.619 
  1.219   3.565   0.599   3.997  -9.662 
  1.962  -4.179  -7.274   6.126   3.079 
 -4.147  -4.790   8.984   6.474  -8.906 

  2.271   7.838   1.749   4.609   2.488 
 -3.607  -8.265   5.711  -9.265   5.942 
 -5.911  -9.973   8.035  -8.332  -2.363 
  8.417   2.888  -8.798  -0.984  -3.115 
 -8.461  -9.022   2.706  -5.734   7.103 

 -4.215   0.118  -7.687  -5.231  -3.408 
 -6.592   7.040  -5.570   5.157   1.649 
  6.918  -8.450   3.385   2.629  -7.714 
 -0.673   6.718  -7.688  -2.638   8.386 
 -0.051  -4.221   1.274   1.150   4.795 

Weight Array
Weight Array[1]
  8.159   2.689   5.773 
  0.866   6.955   2.876 
  6.651  -2.927   5.189 

 -8.580   3.665   8.596 
  8.460   8.095   3.753 
  0.109   5.014   5.304 

 -6.506  -2.357   7.589 
  2.821  -5.640   9.901 
 -9.816  -7.253  -0.150 

Weight Array[2]
 -4.037   4.021  -9.000 
 -9.243   2.180   3.690 
  6.530  -6.954   0.645 

 -0.594   9.697   7.718 
 -5.405  -8.884   1.383 
 -6.809   9.576  -0.522 

  6.944  -0.314  -5.508 
  2.248   3.180   2.134 
 -0.163  -3.999   6.495 

Output Array(before update)
  1.000   1.000   1.000 
  1.000   1.000   1.000 
  1.000   1.000   1.000 

  1.000   1.000   1.000 
  1.000   1.000   1.000 
  1.000   1.000   1.000 

--------------------show initial information--------------------
Input Array
 -2.874  -6.777   4.342   4.207  -1.622 
  9.291  -7.907   7.580  -2.390  -9.619 
  1.219   3.565   0.599   3.997  -9.662 
  1.962  -4.179  -7.274   6.126   3.079 
 -4.147  -4.790   8.984   6.474  -8.906 

  2.271   7.838   1.749   4.609   2.488 
 -3.607  -8.265   5.711  -9.265   5.942 
 -5.911  -9.973   8.035  -8.332  -2.363 
  8.417   2.888  -8.798  -0.984  -3.115 
 -8.461  -9.022   2.706  -5.734   7.103 

 -4.215   0.118  -7.687  -5.231  -3.408 
 -6.592   7.040  -5.570   5.157   1.649 
  6.918  -8.450   3.385   2.629  -7.714 
 -0.673   6.718  -7.688  -2.638   8.386 
 -0.051  -4.221   1.274   1.150   4.795 

Weight Array
Weight Array[1]
  8.159   2.689   5.773 
  0.866   6.955   2.876 
  6.651  -2.927   5.189 

 -8.580   3.665   8.596 
  8.460   8.095   3.753 
  0.109   5.014   5.304 

 -6.506  -2.357   7.589 
  2.821  -5.640   9.901 
 -9.816  -7.253  -0.150 

Weight Array[2]
 -4.037   4.021  -9.000 
 -9.243   2.180   3.690 
  6.530  -6.954   0.645 

 -0.594   9.697   7.718 
 -5.405  -8.884   1.383 
 -6.809   9.576  -0.522 

  6.944  -0.314  -5.508 
  2.248   3.180   2.134 
 -0.163  -3.999   6.495 

Output Array(after update)
  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000 

  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000 

--------------------show initial information--------------------
Input Array
 -2.874  -6.777   4.342   4.207  -1.622 
  9.291  -7.907   7.580  -2.390  -9.619 
  1.219   3.565   0.599   3.997  -9.662 
  1.962  -4.179  -7.274   6.126   3.079 
 -4.147  -4.790   8.984   6.474  -8.906 

  2.271   7.838   1.749   4.609   2.488 
 -3.607  -8.265   5.711  -9.265   5.942 
 -5.911  -9.973   8.035  -8.332  -2.363 
  8.417   2.888  -8.798  -0.984  -3.115 
 -8.461  -9.022   2.706  -5.734   7.103 

 -4.215   0.118  -7.687  -5.231  -3.408 
 -6.592   7.040  -5.570   5.157   1.649 
  6.918  -8.450   3.385   2.629  -7.714 
 -0.673   6.718  -7.688  -2.638   8.386 
 -0.051  -4.221   1.274   1.150   4.795 

Weight Array
Weight Array[1]
  8.159   2.689   5.773 
  0.866   6.955   2.876 
  6.651  -2.927   5.189 

 -8.580   3.665   8.596 
  8.460   8.095   3.753 
  0.109   5.014   5.304 

 -6.506  -2.357   7.589 
  2.821  -5.640   9.901 
 -9.816  -7.253  -0.150 

Weight Array[2]
 -4.037   4.021  -9.000 
 -9.243   2.180   3.690 
  6.530  -6.954   0.645 

 -0.594   9.697   7.718 
 -5.405  -8.884   1.383 
 -6.809   9.576  -0.522 

  6.944  -0.314  -5.508 
  2.248   3.180   2.134 
 -0.163  -3.999   6.495 

Output Array
999.000 999.000 999.000 
999.000 999.000 999.000 
999.000 999.000 999.000 

999.000 999.000 999.000 
999.000 999.000 999.000 
999.000 999.000 999.000
  1. Then I changed the code in a more complex way, and something went wrong:
    I only changed the code in the function "TOA_testConv()" in "TestOpenACC.c"
#pragma acc data present(testConvLayer[0:1], \
    testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH], \
    testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH], \
    testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])
    {
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            TOA_conv3D(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara,
                       testConvLayer->filter_weights_ACC, &testConvLayer->tv_filterPara,
                       testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara, &filter_index,
                       &testConvLayer->tv_stride);
        }
    }

And the output is: (The difference between this and output before is only the last six lines)

--------------------show initial information--------------------
Input Array
  1.187   2.783  -3.790  -8.796  -1.941 
  5.036   2.132  -2.366  -7.224   9.760 
 -0.783  -1.866   1.615   7.014   8.169 
  1.632   6.412   8.553   6.599   7.989 
 -6.686   5.248   5.816   5.806  -8.147 

 -2.467   5.316  -8.835  -8.757   9.422 
  9.081   2.430   2.205  -4.708   3.635 
 -9.736  -9.672  -4.234  -2.102  -6.896 
 -4.474   7.115   1.238   7.141   4.128 
 -0.593  -1.227   0.540  -2.040  -4.627 

 -1.471   1.274  -9.379  -5.656  -2.920 
 -7.526   1.877  -7.604  -6.361   3.120 
 -8.182  -7.280  -4.450   4.023  -1.988 
  9.185   4.286  -1.661  -5.049  -7.816 
  1.443   0.477   9.299  -7.320  -2.382 

Weight Array
Weight Array[1]
  3.427   2.087   6.392 
 -6.033  -9.953  -8.236 
  2.495   1.321  -7.614 

  6.839   8.401  -5.140 
 -1.284  -9.203  -1.502 
 -8.164  -7.386   1.218 

 -2.613   6.637   9.230 
 -3.429   0.923  -2.430 
  1.522   3.107   9.012 

Weight Array[2]
 -8.001   2.406  -8.307 
 -0.383  -4.168   3.780 
 -3.991  -0.201   3.827 

 -2.226  -7.706  -4.852 
  0.159   9.134  -6.451 
  5.019  -2.150  -5.654 

 -6.483  -0.314  -3.040 
  4.735   7.073  -6.403 
  3.966  -6.356   4.520 

Output Array(before update)
  1.000   1.000   1.000 
  1.000   1.000   1.000 
  1.000   1.000   1.000 

  1.000   1.000   1.000 
  1.000   1.000   1.000 
  1.000   1.000   1.000 

--------------------show initial information--------------------
Input Array
  1.187   2.783  -3.790  -8.796  -1.941 
  5.036   2.132  -2.366  -7.224   9.760 
 -0.783  -1.866   1.615   7.014   8.169 
  1.632   6.412   8.553   6.599   7.989 
 -6.686   5.248   5.816   5.806  -8.147 

 -2.467   5.316  -8.835  -8.757   9.422 
  9.081   2.430   2.205  -4.708   3.635 
 -9.736  -9.672  -4.234  -2.102  -6.896 
 -4.474   7.115   1.238   7.141   4.128 
 -0.593  -1.227   0.540  -2.040  -4.627 

 -1.471   1.274  -9.379  -5.656  -2.920 
 -7.526   1.877  -7.604  -6.361   3.120 
 -8.182  -7.280  -4.450   4.023  -1.988 
  9.185   4.286  -1.661  -5.049  -7.816 
  1.443   0.477   9.299  -7.320  -2.382 

Weight Array
Weight Array[1]
  3.427   2.087   6.392 
 -6.033  -9.953  -8.236 
  2.495   1.321  -7.614 

  6.839   8.401  -5.140 
 -1.284  -9.203  -1.502 
 -8.164  -7.386   1.218 

 -2.613   6.637   9.230 
 -3.429   0.923  -2.430 
  1.522   3.107   9.012 

Weight Array[2]
 -8.001   2.406  -8.307 
 -0.383  -4.168   3.780 
 -3.991  -0.201   3.827 

 -2.226  -7.706  -4.852 
  0.159   9.134  -6.451 
  5.019  -2.150  -5.654 

 -6.483  -0.314  -3.040 
  4.735   7.073  -6.403 
  3.966  -6.356   4.520 

Output Array(after update)
  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000 

  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000 

--------------------show initial information--------------------
Input Array
  1.187   2.783  -3.790  -8.796  -1.941 
  5.036   2.132  -2.366  -7.224   9.760 
 -0.783  -1.866   1.615   7.014   8.169 
  1.632   6.412   8.553   6.599   7.989 
 -6.686   5.248   5.816   5.806  -8.147 

 -2.467   5.316  -8.835  -8.757   9.422 
  9.081   2.430   2.205  -4.708   3.635 
 -9.736  -9.672  -4.234  -2.102  -6.896 
 -4.474   7.115   1.238   7.141   4.128 
 -0.593  -1.227   0.540  -2.040  -4.627 

 -1.471   1.274  -9.379  -5.656  -2.920 
 -7.526   1.877  -7.604  -6.361   3.120 
 -8.182  -7.280  -4.450   4.023  -1.988 
  9.185   4.286  -1.661  -5.049  -7.816 
  1.443   0.477   9.299  -7.320  -2.382 

Weight Array
Weight Array[1]
  3.427   2.087   6.392 
 -6.033  -9.953  -8.236 
  2.495   1.321  -7.614 

  6.839   8.401  -5.140 
 -1.284  -9.203  -1.502 
 -8.164  -7.386   1.218 

 -2.613   6.637   9.230 
 -3.429   0.923  -2.430 
  1.522   3.107   9.012 

Weight Array[2]
 -8.001   2.406  -8.307 
 -0.383  -4.168   3.780 
 -3.991  -0.201   3.827 

 -2.226  -7.706  -4.852 
  0.159   9.134  -6.451 
  5.019  -2.150  -5.654 

 -6.483  -0.314  -3.040 
  4.735   7.073  -6.403 
  3.966  -6.356   4.520 

Output Array
  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000 

  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000

As you can see, all the values in the last Output Array are 0.

So, I have 3 questions:
1. How can I use a parallel loop gang to call a routine worker?
2. The final code I want in "TOA_conv3D()" is:

/*
 * Device routine: computes one depth slice (*_depth_index) of the output of
 * a 3-D convolution.  For each output element it accumulates input * filter
 * over the filter volume, with the filter rotated 180 degrees in X and Y.
 *
 * FIXES:
 *  - "worker" is now requested explicitly on the outer loop (the compiler
 *    otherwise schedules it "seq", as the build log below shows).
 *  - the original body read *_bias, but the _bias parameter is commented
 *    out of the signature, so that reference cannot compile; it is kept
 *    commented out here until the parameter is restored.
 */
RESULTTYPE TOA_conv3D(
        /**/DATATYPE const ***const _input_array, Para_3D const *const _input3DPara,
            DATATYPE const ****const _filter_array, Para_3D const *const _filter3DPara,
            DATATYPE ***const _output_array, Para_3D const *const _output3DPara, CPTYPE const *const _depth_index,
            CPTYPE const *const _stride/*, DATATYPE const *const _bias*/)
{
#pragma acc loop worker collapse(2) independent
    for (CPTYPE output_index_Y = 0; output_index_Y < _output3DPara->HEIGHT; output_index_Y++)
    {
        for (CPTYPE output_index_X = 0; output_index_X < _output3DPara->WIDTH; output_index_X++)
        {
            //calculate every element in the output array
            DATATYPE sum_temp = 0;
#pragma acc loop vector reduction(+:sum_temp) collapse(3) tile(*) independent
            for (CPTYPE filterZ = 0; filterZ < _filter3DPara->DEPTH; filterZ++)//over depth
            {
                for (CPTYPE filterY = 0; filterY < _filter3DPara->HEIGHT; filterY++)//over the Y axis
                {
                    for (CPTYPE filterX = 0; filterX < _filter3DPara->WIDTH; filterX++)//over the X axis
                    {
                        sum_temp += _input_array[filterZ][output_index_Y * *_stride + filterY][output_index_X * *_stride + filterX] *
                                    _filter_array[*_depth_index][filterZ][_filter3DPara->HEIGHT - filterY - 1][_filter3DPara->WIDTH - filterX - 1];//filter rotated 180 degrees
                    }
                }
            }
            /* sum_temp += *_bias;  -- disabled: _bias is not a parameter yet */
            _output_array[*_depth_index][output_index_Y][output_index_X] = sum_temp;//write into the output array
        }
    }
    return SUCCEED;
}

But when I compile this, the compiler says:

TOA_conv3D:
    129, Generating Tesla code
        155, #pragma acc loop seq collapse(2)
        157,   collapsed */
        162, #pragma acc loop vector /* threadIdx.x */
        164, #pragma acc loop seq
        166, #pragma acc loop worker /* threadIdx.y */
    129, Generating acc routine worker
         reduction in routine disables compute capability 2.0 kernel
    155, Loop is parallelizable
    157, Loop is parallelizable
    162, Loop is parallelizable
    164, Loop is parallelizable
    166, Loop is parallelizable

Does "reduction in routine disables compute capability 2.0 kernel" matter? Does it mean that my reduction will not be performed?
3. Do you have any advice about calling a routine (with many nested loops) from a parallel region, like in my code? Is my code good for efficiency?

The information of GPU shows below:

CUDA Driver Version:           7050
NVRM version:                  NVIDIA UNIX x86_64 Kernel Module 352.39 Fri Aug 14 18:09:10 PDT 2015

Device Number:                 0
Device Name:                   Tesla K20m
Device Revision Number:        3.5
Global Memory Size:            5032706048
Number of Multiprocessors:     13
Number of SP Cores:            2496
Number of DP Cores:            832
Concurrent Copy and Execution: Yes
Total Constant Memory:         65536
Total Shared Memory per Block: 49152
Registers per Block:           65536
Warp Size:                     32
Maximum Threads per Block:     1024
Maximum Block Dimensions:      1024, 1024, 64
Maximum Grid Dimensions:       2147483647 x 65535 x 65535
Maximum Memory Pitch:          2147483647B
Texture Alignment:             512B
Clock Rate:                    705 MHz
Execution Timeout:             No
Integrated Device:             No
Can Map Host Memory:           Yes
Compute Mode:                  default
Concurrent Kernels:            Yes
ECC Enabled:                   Yes
Memory Clock Rate:             2600 MHz
Memory Bus Width:              320 bits
L2 Cache Size:                 1310720 bytes
Max Threads Per SMP:           2048
Async Engines:                 2
Unified Addressing:            Yes
Managed Memory:                Yes
PGI Compiler Option:           -ta=tesla:cc35



Hope for your reply~

by the way,I forgot that in the first post

#define CPTYPE int

Hello,

I am sending this question to engineering, but I can answer question 2.

“reduction in routine disables compute capability 2.0 kernel”

means that CUDA code produced uses features not in GPUs with
Compute Capacity 2.0 or less.

OpenACC tries to support all levels of CC (1.3,2.0,3.5,5.0,6.0,)
where newer GPUs have higher CC values.

-ta=tesla:cc3x

will give you code for GPUs 3.x (or higher) but not lower, and the higher GPUs will be running the 3x level of operations.

So restrict the GPUs that will run your code. As products age, older
CC levels will no longer have default support in compiling and executables.

dave

Thank you for your reply~~

So, is that probably a compiler problem?

And I have another question:
Is this related to the version of my PGI compiler?
I'm using the Community Edition now.

Hi HiJacker,

First, the PGI Community Edition contains all available features so that is not the issue. Also, in regards to your response to Dave’s post about using the “reduction” clause, this is a limitation of the hardware, not the compiler. Though, you’re using a K20 which does not have this restriction.

As for question #1, you’re using “routine” correctly.
For #3, using inner loop reductions is only really beneficial for longer loop bounds given that there is overhead in in creating partial reductions and the final reduction.

The main question is why you're getting zeros for your results. I noticed a couple of things which may or may not be the cause.

In the second example, I don't see an "acc parallel loop" directive around the loop that calls "TOA_conv3D", just a data region. Hence, you may not actually be running the code on the device but are overwriting the host data. Not sure if this is just a typo in your post.

Also, for the schedule used in "TOA_conv3D", I'd explicitly add "worker" to the outermost loop. Generally, the compiler won't use a "worker" schedule unless it is specified by the user. This is why the outer loop is getting a "seq" schedule. I doubt this would cause your wrong answers, but it is worth a try.

If these don’t solve the issue, can you please send a reproducing example to PGI Customer Service (trs@pgroup.com) and ask them to send it to me? That should help me determine where the problem is.

-Mat

I'm very sorry about a mistake in my earlier posts.
I pasted the two code snippets into the wrong places in the posts
(the day I posted this I was very tired from debugging the code and forgot to check what I posted).
The correct post has a slight difference, that is:

#pragma acc data present(testConvLayer[0:1], \
    testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH], \
    testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH], \
    testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])
    {
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            TOA_conv3D(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara,
                       testConvLayer->filter_weights_ACC, &testConvLayer->tv_filterPara,
                       testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara, &filter_index,
                       &testConvLayer->tv_stride);
        }
    }

result in:

Output Array
999.000 999.000 999.000 
999.000 999.000 999.000 
999.000 999.000 999.000 

999.000 999.000 999.000 
999.000 999.000 999.000 
999.000 999.000 999.000

and

#pragma acc parallel loop gang present(testConvLayer[0:1], \
    testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH], \
    testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH], \
    testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])\
    num_workers(1)
    for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
    {
        TOA_conv3D(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara,
                   testConvLayer->filter_weights_ACC, &testConvLayer->tv_filterPara,
                   testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara, &filter_index,
                   &testConvLayer->tv_stride);
    }

result in

Output Array
  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000 

  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000

The version of the OpenACC code that seems to do nothing is:

#pragma acc parallel loop gang present(testConvLayer[0:1], \
    testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH], \
    testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH], \
    testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])\
    num_workers(1)
    for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
    {
        TOA_conv3D(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara,
                   testConvLayer->filter_weights_ACC, &testConvLayer->tv_filterPara,
                   testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara, &filter_index,
                   &testConvLayer->tv_stride);
    }

not

#pragma acc data present(testConvLayer[0:1], \
    testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH], \
    testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH], \
    testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])
    {
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            TOA_conv3D(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara,
                       testConvLayer->filter_weights_ACC, &testConvLayer->tv_filterPara,
                       testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara, &filter_index,
                       &testConvLayer->tv_stride);
        }
    }




Ok, though I don’t see anything obvious from the code you posted. One possibility is that you’re not copying back the data from the device, but you could have an update directive later in the source.

Please send a reproducing example to PGI Customer Service (trs@pgroup.com) and ask them to send it to me. I’ll take a look and see what I can determine.

Thanks,
Mat

I have sent it, thanks~~

Thanks HiJacker. I took a look and the problem is that you need to update the host array after you do the computation on the device. Otherwise, you’re printing old values.

#pragma acc parallel loop gang present(testConvLayer[0:1], \
    testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH], \
    testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH], \
    testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])\
    num_workers(1)
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            TOA_conv3D(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara,
                       testConvLayer->filter_weights_ACC, &testConvLayer->tv_filterPara,
                       testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara, &filter_index,
                       &testConvLayer->tv_stride);
        }
    }
#pragma acc update host(testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])

Thank you for your reply, and I sent you another e-mail for this problem, because there is still something wrong. hope for your reply.