Puzzling problem: calling a routine from a parallel region

Hello, I am a student from China. I have just started learning OpenACC and am new to the concept of "routine".

Here I have encountered a problem: when I use a parallel loop gang to call a worker routine, the loop in the routine seems to do nothing. I have looked through a lot of information (from books and the Internet) and changed my code again and again to resolve this problem, but failed. Here is my code:

  1. First, here is a correct version of the code:
    (This is some code from other files:)
#define SUCCEED 0

#define DATATYPE float

/* Dimensions of a 3-D array (used for input images, filters and outputs). */
typedef struct _Para_3D
{
    CPTYPE WIDTH;   // extent along X
    CPTYPE HEIGHT;  // extent along Y
    CPTYPE DEPTH;   // extent along Z (number of 2-D slices)
} Para_3D;

/* Dimensions of a 2-D array. */
typedef struct _Para_2D
{
    CPTYPE WIDTH;   // extent along X
    CPTYPE HEIGHT;  // extent along Y
} Para_2D;

I have these code in TestOpenACC.h

/* One test convolution layer: shape parameters plus host-side data arrays.
   (Original comments were mojibake-damaged Chinese; translated to English.) */
typedef struct _testconvlayer
{
    //Parameter
    Para_3D tv_inputPara;//input image dimensions
    Para_3D tv_filterPara;//filter dimensions
    Para_3D tv_outputPara;//output image dimensions
    CPTYPE tv_filter_number;//number of filters
    CPTYPE tv_stride;//convolution stride

    //Data
    DATATYPE ****restrict filter_weights_ACC;//4-D filter weight array; freed in one place at final teardown
    DATATYPE ***restrict tv_input_array;//input image array
    DATATYPE ***restrict tv_output_array_ACC;//output 3-D array; freed in one place at final teardown
} TestConvLayer;

/* Writes one depth slice (*_depth_index) of _output_array on the device.
   Declared "#pragma acc routine worker" at its definition, so it may be
   called from a gang-level loop in an OpenACC parallel region.
   Returns SUCCEED on completion. */
RESULTTYPE TOA_conv3D(
            DATATYPE const ***const _input_array, Para_3D const *const _input3DPara,
            DATATYPE const ****const _filter_array, Para_3D const *const _filter3DPara,
            DATATYPE ***const _output_array, Para_3D const *const _output3DPara, CPTYPE const *const _depth_index,
            CPTYPE const *const _stride);

/* Host-side driver that builds a small test layer and runs TOA_conv3D. */
void TOA_testConv();

And these code in TestOpenACC.c

/*
 * Device routine: fills depth slice *_depth_index of _output_array with the
 * constant 999 (a placeholder for the real convolution).  Declared as an
 * OpenACC "routine worker" so it can be called from a gang-level loop in a
 * parallel region; the loop below is then distributed across the workers of
 * the calling gang.
 *
 * FIX: the loop directive now says "worker" explicitly.  Per the compiler
 * feedback quoted later in this thread, the compiler will not choose a
 * worker schedule unless the user requests it, so without this keyword the
 * collapsed loop was compiled as "seq".
 */
#pragma acc routine worker
RESULTTYPE TOA_conv3D(
            DATATYPE const ***const _input_array, Para_3D const *const _input3DPara,
            DATATYPE const ****const _filter_array, Para_3D const *const _filter3DPara,
            DATATYPE ***const _output_array, Para_3D const *const _output3DPara, CPTYPE const *const _depth_index,
            CPTYPE const *const _stride)
{
    /* Distribute the 2-D output plane over the workers of the calling gang. */
#pragma acc loop worker collapse(2) independent
    for (CPTYPE output_index_Y = 0; output_index_Y < _output3DPara->HEIGHT; output_index_Y++)
    {
        for (CPTYPE output_index_X = 0; output_index_X < _output3DPara->WIDTH; output_index_X++)
        {
            _output_array[*_depth_index][output_index_Y][output_index_X] = 999;//write marker value into the output array
        }
    }
    return SUCCEED;
}

void TOA_testConv()
{
    TestConvLayer *testConvLayer = malloc(sizeof(TestConvLayer));

    {//Init
        DATATYPE low = -10, high = -1.0f * low, zero = 0, one = 1, two = 2;

        testConvLayer->tv_stride = 1;
        {//Parameter of input
            testConvLayer->tv_inputPara.DEPTH = 3;
            testConvLayer->tv_inputPara.HEIGHT = 5;
            testConvLayer->tv_inputPara.WIDTH = 5;
        }
        {//Parameter of input of filter
            testConvLayer->tv_filterPara.DEPTH = 3;
            testConvLayer->tv_filterPara.HEIGHT = 3;
            testConvLayer->tv_filterPara.WIDTH = 3;
            testConvLayer->tv_filter_number = 2;
        }
        {//Parameter of output
            testConvLayer->tv_outputPara.DEPTH = 2;
            testConvLayer->tv_outputPara.HEIGHT = 3;
            testConvLayer->tv_outputPara.WIDTH = 3;
        }
        testConvLayer->tv_input_array = CPU_T_random_uniform3D(&low, &high, &testConvLayer->tv_inputPara);
        testConvLayer->filter_weights_ACC = malloc(sizeof(DATATYPE ***) * testConvLayer->tv_filter_number);
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            testConvLayer->filter_weights_ACC[filter_index] = CPU_T_random_uniform3D(&low, &high, &testConvLayer->tv_filterPara);
        }
        testConvLayer->tv_output_array_ACC = CPU_T_array3D(&testConvLayer->tv_outputPara, &one);
#pragma acc enter data copyin(testConvLayer[0:1])
#pragma acc enter data copyin(testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH])
#pragma acc enter data copyin(testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH])
#pragma acc enter data create(testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])
    }

    {//show array data
        printf("--------------------show initial information--------------------\n");
        printf("Input Array\n");
        CPU_T_show3Darray(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara);
        printf("Weight Array\n");
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            printf("Weight Array[%d]\n", filter_index + 1);
            CPU_T_show3Darray(testConvLayer->filter_weights_ACC[filter_index], &testConvLayer->tv_filterPara);
        }
        printf("Output Array(before update)\n");
        CPU_T_show3Darray(testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara);
    }

#pragma acc update host(testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])
    {//show array data
        printf("--------------------show initial information--------------------\n");
        printf("Input Array\n");
        CPU_T_show3Darray(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara);
        printf("Weight Array\n");
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            printf("Weight Array[%d]\n", filter_index + 1);
            CPU_T_show3Darray(testConvLayer->filter_weights_ACC[filter_index], &testConvLayer->tv_filterPara);
        }
        printf("Output Array(after update)\n");
        CPU_T_show3Darray(testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara);
    }

#pragma acc parallel loop gang present(testConvLayer[0:1], \
    testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH], \
    testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH], \
    testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])\
    num_workers(1)
    for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
    {
        TOA_conv3D(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara,
                   testConvLayer->filter_weights_ACC, &testConvLayer->tv_filterPara,
                   testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara, &filter_index,
                   &testConvLayer->tv_stride);
    }

    {//show array data
        printf("--------------------show initial information--------------------\n");
        printf("Input Array\n");
        CPU_T_show3Darray(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara);
        printf("Weight Array\n");
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            printf("Weight Array[%d]\n", filter_index + 1);
            CPU_T_show3Darray(testConvLayer->filter_weights_ACC[filter_index], &testConvLayer->tv_filterPara);
        }
        printf("Output Array\n");
        CPU_T_show3Darray(testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara);
    }

    {//delete data in gpu
#pragma acc exit data delete(testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH])
#pragma acc exit data delete(testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH])
#pragma acc exit data delete(testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])
#pragma acc exit data delete(testConvLayer[0:1])
    }
}

And the output is:

--------------------show initial information--------------------
Input Array
 -2.874  -6.777   4.342   4.207  -1.622 
  9.291  -7.907   7.580  -2.390  -9.619 
  1.219   3.565   0.599   3.997  -9.662 
  1.962  -4.179  -7.274   6.126   3.079 
 -4.147  -4.790   8.984   6.474  -8.906 

  2.271   7.838   1.749   4.609   2.488 
 -3.607  -8.265   5.711  -9.265   5.942 
 -5.911  -9.973   8.035  -8.332  -2.363 
  8.417   2.888  -8.798  -0.984  -3.115 
 -8.461  -9.022   2.706  -5.734   7.103 

 -4.215   0.118  -7.687  -5.231  -3.408 
 -6.592   7.040  -5.570   5.157   1.649 
  6.918  -8.450   3.385   2.629  -7.714 
 -0.673   6.718  -7.688  -2.638   8.386 
 -0.051  -4.221   1.274   1.150   4.795 

Weight Array
Weight Array[1]
  8.159   2.689   5.773 
  0.866   6.955   2.876 
  6.651  -2.927   5.189 

 -8.580   3.665   8.596 
  8.460   8.095   3.753 
  0.109   5.014   5.304 

 -6.506  -2.357   7.589 
  2.821  -5.640   9.901 
 -9.816  -7.253  -0.150 

Weight Array[2]
 -4.037   4.021  -9.000 
 -9.243   2.180   3.690 
  6.530  -6.954   0.645 

 -0.594   9.697   7.718 
 -5.405  -8.884   1.383 
 -6.809   9.576  -0.522 

  6.944  -0.314  -5.508 
  2.248   3.180   2.134 
 -0.163  -3.999   6.495 

Output Array(before update)
  1.000   1.000   1.000 
  1.000   1.000   1.000 
  1.000   1.000   1.000 

  1.000   1.000   1.000 
  1.000   1.000   1.000 
  1.000   1.000   1.000 

--------------------show initial information--------------------
Input Array
 -2.874  -6.777   4.342   4.207  -1.622 
  9.291  -7.907   7.580  -2.390  -9.619 
  1.219   3.565   0.599   3.997  -9.662 
  1.962  -4.179  -7.274   6.126   3.079 
 -4.147  -4.790   8.984   6.474  -8.906 

  2.271   7.838   1.749   4.609   2.488 
 -3.607  -8.265   5.711  -9.265   5.942 
 -5.911  -9.973   8.035  -8.332  -2.363 
  8.417   2.888  -8.798  -0.984  -3.115 
 -8.461  -9.022   2.706  -5.734   7.103 

 -4.215   0.118  -7.687  -5.231  -3.408 
 -6.592   7.040  -5.570   5.157   1.649 
  6.918  -8.450   3.385   2.629  -7.714 
 -0.673   6.718  -7.688  -2.638   8.386 
 -0.051  -4.221   1.274   1.150   4.795 

Weight Array
Weight Array[1]
  8.159   2.689   5.773 
  0.866   6.955   2.876 
  6.651  -2.927   5.189 

 -8.580   3.665   8.596 
  8.460   8.095   3.753 
  0.109   5.014   5.304 

 -6.506  -2.357   7.589 
  2.821  -5.640   9.901 
 -9.816  -7.253  -0.150 

Weight Array[2]
 -4.037   4.021  -9.000 
 -9.243   2.180   3.690 
  6.530  -6.954   0.645 

 -0.594   9.697   7.718 
 -5.405  -8.884   1.383 
 -6.809   9.576  -0.522 

  6.944  -0.314  -5.508 
  2.248   3.180   2.134 
 -0.163  -3.999   6.495 

Output Array(after update)
  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000 

  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000 

--------------------show initial information--------------------
Input Array
 -2.874  -6.777   4.342   4.207  -1.622 
  9.291  -7.907   7.580  -2.390  -9.619 
  1.219   3.565   0.599   3.997  -9.662 
  1.962  -4.179  -7.274   6.126   3.079 
 -4.147  -4.790   8.984   6.474  -8.906 

  2.271   7.838   1.749   4.609   2.488 
 -3.607  -8.265   5.711  -9.265   5.942 
 -5.911  -9.973   8.035  -8.332  -2.363 
  8.417   2.888  -8.798  -0.984  -3.115 
 -8.461  -9.022   2.706  -5.734   7.103 

 -4.215   0.118  -7.687  -5.231  -3.408 
 -6.592   7.040  -5.570   5.157   1.649 
  6.918  -8.450   3.385   2.629  -7.714 
 -0.673   6.718  -7.688  -2.638   8.386 
 -0.051  -4.221   1.274   1.150   4.795 

Weight Array
Weight Array[1]
  8.159   2.689   5.773 
  0.866   6.955   2.876 
  6.651  -2.927   5.189 

 -8.580   3.665   8.596 
  8.460   8.095   3.753 
  0.109   5.014   5.304 

 -6.506  -2.357   7.589 
  2.821  -5.640   9.901 
 -9.816  -7.253  -0.150 

Weight Array[2]
 -4.037   4.021  -9.000 
 -9.243   2.180   3.690 
  6.530  -6.954   0.645 

 -0.594   9.697   7.718 
 -5.405  -8.884   1.383 
 -6.809   9.576  -0.522 

  6.944  -0.314  -5.508 
  2.248   3.180   2.134 
 -0.163  -3.999   6.495 

Output Array
999.000 999.000 999.000 
999.000 999.000 999.000 
999.000 999.000 999.000 

999.000 999.000 999.000 
999.000 999.000 999.000 
999.000 999.000 999.000
  1. Then I changed the code in a more complex way, and something went wrong:
    I only changed the code in the function "TOA_testConv()" in "TestOpenACC.c"
#pragma acc data present(testConvLayer[0:1], \
    testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH], \
    testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH], \
    testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])
    {
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            TOA_conv3D(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara,
                       testConvLayer->filter_weights_ACC, &testConvLayer->tv_filterPara,
                       testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara, &filter_index,
                       &testConvLayer->tv_stride);
        }
    }

And the output is: (The difference between this and output before is only the last six lines)

--------------------show initial information--------------------
Input Array
  1.187   2.783  -3.790  -8.796  -1.941 
  5.036   2.132  -2.366  -7.224   9.760 
 -0.783  -1.866   1.615   7.014   8.169 
  1.632   6.412   8.553   6.599   7.989 
 -6.686   5.248   5.816   5.806  -8.147 

 -2.467   5.316  -8.835  -8.757   9.422 
  9.081   2.430   2.205  -4.708   3.635 
 -9.736  -9.672  -4.234  -2.102  -6.896 
 -4.474   7.115   1.238   7.141   4.128 
 -0.593  -1.227   0.540  -2.040  -4.627 

 -1.471   1.274  -9.379  -5.656  -2.920 
 -7.526   1.877  -7.604  -6.361   3.120 
 -8.182  -7.280  -4.450   4.023  -1.988 
  9.185   4.286  -1.661  -5.049  -7.816 
  1.443   0.477   9.299  -7.320  -2.382 

Weight Array
Weight Array[1]
  3.427   2.087   6.392 
 -6.033  -9.953  -8.236 
  2.495   1.321  -7.614 

  6.839   8.401  -5.140 
 -1.284  -9.203  -1.502 
 -8.164  -7.386   1.218 

 -2.613   6.637   9.230 
 -3.429   0.923  -2.430 
  1.522   3.107   9.012 

Weight Array[2]
 -8.001   2.406  -8.307 
 -0.383  -4.168   3.780 
 -3.991  -0.201   3.827 

 -2.226  -7.706  -4.852 
  0.159   9.134  -6.451 
  5.019  -2.150  -5.654 

 -6.483  -0.314  -3.040 
  4.735   7.073  -6.403 
  3.966  -6.356   4.520 

Output Array(before update)
  1.000   1.000   1.000 
  1.000   1.000   1.000 
  1.000   1.000   1.000 

  1.000   1.000   1.000 
  1.000   1.000   1.000 
  1.000   1.000   1.000 

--------------------show initial information--------------------
Input Array
  1.187   2.783  -3.790  -8.796  -1.941 
  5.036   2.132  -2.366  -7.224   9.760 
 -0.783  -1.866   1.615   7.014   8.169 
  1.632   6.412   8.553   6.599   7.989 
 -6.686   5.248   5.816   5.806  -8.147 

 -2.467   5.316  -8.835  -8.757   9.422 
  9.081   2.430   2.205  -4.708   3.635 
 -9.736  -9.672  -4.234  -2.102  -6.896 
 -4.474   7.115   1.238   7.141   4.128 
 -0.593  -1.227   0.540  -2.040  -4.627 

 -1.471   1.274  -9.379  -5.656  -2.920 
 -7.526   1.877  -7.604  -6.361   3.120 
 -8.182  -7.280  -4.450   4.023  -1.988 
  9.185   4.286  -1.661  -5.049  -7.816 
  1.443   0.477   9.299  -7.320  -2.382 

Weight Array
Weight Array[1]
  3.427   2.087   6.392 
 -6.033  -9.953  -8.236 
  2.495   1.321  -7.614 

  6.839   8.401  -5.140 
 -1.284  -9.203  -1.502 
 -8.164  -7.386   1.218 

 -2.613   6.637   9.230 
 -3.429   0.923  -2.430 
  1.522   3.107   9.012 

Weight Array[2]
 -8.001   2.406  -8.307 
 -0.383  -4.168   3.780 
 -3.991  -0.201   3.827 

 -2.226  -7.706  -4.852 
  0.159   9.134  -6.451 
  5.019  -2.150  -5.654 

 -6.483  -0.314  -3.040 
  4.735   7.073  -6.403 
  3.966  -6.356   4.520 

Output Array(after update)
  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000 

  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000 

--------------------show initial information--------------------
Input Array
  1.187   2.783  -3.790  -8.796  -1.941 
  5.036   2.132  -2.366  -7.224   9.760 
 -0.783  -1.866   1.615   7.014   8.169 
  1.632   6.412   8.553   6.599   7.989 
 -6.686   5.248   5.816   5.806  -8.147 

 -2.467   5.316  -8.835  -8.757   9.422 
  9.081   2.430   2.205  -4.708   3.635 
 -9.736  -9.672  -4.234  -2.102  -6.896 
 -4.474   7.115   1.238   7.141   4.128 
 -0.593  -1.227   0.540  -2.040  -4.627 

 -1.471   1.274  -9.379  -5.656  -2.920 
 -7.526   1.877  -7.604  -6.361   3.120 
 -8.182  -7.280  -4.450   4.023  -1.988 
  9.185   4.286  -1.661  -5.049  -7.816 
  1.443   0.477   9.299  -7.320  -2.382 

Weight Array
Weight Array[1]
  3.427   2.087   6.392 
 -6.033  -9.953  -8.236 
  2.495   1.321  -7.614 

  6.839   8.401  -5.140 
 -1.284  -9.203  -1.502 
 -8.164  -7.386   1.218 

 -2.613   6.637   9.230 
 -3.429   0.923  -2.430 
  1.522   3.107   9.012 

Weight Array[2]
 -8.001   2.406  -8.307 
 -0.383  -4.168   3.780 
 -3.991  -0.201   3.827 

 -2.226  -7.706  -4.852 
  0.159   9.134  -6.451 
  5.019  -2.150  -5.654 

 -6.483  -0.314  -3.040 
  4.735   7.073  -6.403 
  3.966  -6.356   4.520 

Output Array
  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000 

  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000

As you can see, all the values in the last Output Array are 0.

So, I have 3 questions:
1. How can I use a parallel loop gang to call a routine worker?
2. The final code I want in "TOA_conv3D()" is:

/*
 * Device routine: computes one depth slice (*_depth_index) of the output of
 * a 3-D convolution.  For each output element it accumulates input * filter
 * over the filter volume, with the filter rotated 180 degrees in X and Y.
 *
 * FIXES:
 *  - "worker" is now requested explicitly on the outer loop (the compiler
 *    otherwise schedules it "seq", as the build log below shows).
 *  - the original body read *_bias, but the _bias parameter is commented
 *    out of the signature, so that reference cannot compile; it is kept
 *    commented out here until the parameter is restored.
 */
RESULTTYPE TOA_conv3D(
        /**/DATATYPE const ***const _input_array, Para_3D const *const _input3DPara,
            DATATYPE const ****const _filter_array, Para_3D const *const _filter3DPara,
            DATATYPE ***const _output_array, Para_3D const *const _output3DPara, CPTYPE const *const _depth_index,
            CPTYPE const *const _stride/*, DATATYPE const *const _bias*/)
{
#pragma acc loop worker collapse(2) independent
    for (CPTYPE output_index_Y = 0; output_index_Y < _output3DPara->HEIGHT; output_index_Y++)
    {
        for (CPTYPE output_index_X = 0; output_index_X < _output3DPara->WIDTH; output_index_X++)
        {
            //calculate every element in the output array
            DATATYPE sum_temp = 0;
#pragma acc loop vector reduction(+:sum_temp) collapse(3) tile(*) independent
            for (CPTYPE filterZ = 0; filterZ < _filter3DPara->DEPTH; filterZ++)//over depth
            {
                for (CPTYPE filterY = 0; filterY < _filter3DPara->HEIGHT; filterY++)//over the Y axis
                {
                    for (CPTYPE filterX = 0; filterX < _filter3DPara->WIDTH; filterX++)//over the X axis
                    {
                        sum_temp += _input_array[filterZ][output_index_Y * *_stride + filterY][output_index_X * *_stride + filterX] *
                                    _filter_array[*_depth_index][filterZ][_filter3DPara->HEIGHT - filterY - 1][_filter3DPara->WIDTH - filterX - 1];//filter rotated 180 degrees
                    }
                }
            }
            /* sum_temp += *_bias;  -- disabled: _bias is not a parameter yet */
            _output_array[*_depth_index][output_index_Y][output_index_X] = sum_temp;//write into the output array
        }
    }
    return SUCCEED;
}

But when I compile this, the compiler says:

TOA_conv3D:
    129, Generating Tesla code
        155, #pragma acc loop seq collapse(2)
        157,   collapsed */
        162, #pragma acc loop vector /* threadIdx.x */
        164, #pragma acc loop seq
        166, #pragma acc loop worker /* threadIdx.y */
    129, Generating acc routine worker
         reduction in routine disables compute capability 2.0 kernel
    155, Loop is parallelizable
    157, Loop is parallelizable
    162, Loop is parallelizable
    164, Loop is parallelizable
    166, Loop is parallelizable

Does "reduction in routine disables compute capability 2.0 kernel" matter? Does it mean that my reduction will not be performed?
3. Do you have any advice about calling a routine (with many nested loops) from a parallel region, like in my code? Is my code good for efficiency?

The information of GPU shows below:

CUDA Driver Version:           7050
NVRM version:                  NVIDIA UNIX x86_64 Kernel Module 352.39 Fri Aug 14 18:09:10 PDT 2015

Device Number:                 0
Device Name:                   Tesla K20m
Device Revision Number:        3.5
Global Memory Size:            5032706048
Number of Multiprocessors:     13
Number of SP Cores:            2496
Number of DP Cores:            832
Concurrent Copy and Execution: Yes
Total Constant Memory:         65536
Total Shared Memory per Block: 49152
Registers per Block:           65536
Warp Size:                     32
Maximum Threads per Block:     1024
Maximum Block Dimensions:      1024, 1024, 64
Maximum Grid Dimensions:       2147483647 x 65535 x 65535
Maximum Memory Pitch:          2147483647B
Texture Alignment:             512B
Clock Rate:                    705 MHz
Execution Timeout:             No
Integrated Device:             No
Can Map Host Memory:           Yes
Compute Mode:                  default
Concurrent Kernels:            Yes
ECC Enabled:                   Yes
Memory Clock Rate:             2600 MHz
Memory Bus Width:              320 bits
L2 Cache Size:                 1310720 bytes
Max Threads Per SMP:           2048
Async Engines:                 2
Unified Addressing:            Yes
Managed Memory:                Yes
PGI Compiler Option:           -ta=tesla:cc35



Hope for your reply~

by the way,I forgot that in the first post

#define CPTYPE int

Hello,

I am sending this question to engineering, but I can answer question 2.

“reduction in routine disables compute capability 2.0 kernel”

means that CUDA code produced uses features not in GPUs with
Compute Capacity 2.0 or less.

OpenACC tries to support all levels of CC (1.3,2.0,3.5,5.0,6.0,)
where newer GPUs have higher CC values.

-ta=tesla:cc3x

will give you code for GPUs 3.x (or higher) but not lower, and the higher GPUs will be running the 3x level of operations.

So restrict the GPUs that will run your code. As products age, older
CC levels will no longer have default support in compiling and executables.

dave

Thank you for your reply~~

So, is that probably a compiler problem?

And I have another question:
Is this related to the version of my PGI compiler?
I'm using the Community Edition now.

Hi HiJacker,

First, the PGI Community Edition contains all available features so that is not the issue. Also, in regards to your response to Dave’s post about using the “reduction” clause, this is a limitation of the hardware, not the compiler. Though, you’re using a K20 which does not have this restriction.

As for question #1, you’re using “routine” correctly.
For #3, using inner loop reductions is only really beneficial for longer loop bounds given that there is overhead in in creating partial reductions and the final reduction.

The main question is why you're getting zeros for your results. I noticed a couple of things which may or may not be the cause.

In the second example, I don't see an "acc parallel loop" directive around the loop that calls "TOA_conv3D", just a data region. Hence, you may not actually be running the code on the device but are overwriting the host data. Not sure if this is just a typo in your post.

Also, for the schedule used in "TOA_conv3D", I'd explicitly add "worker" to the outermost loop. Generally, the compiler won't use a "worker" schedule unless it is specified by the user. This is why the outer loop is getting a "seq" schedule. I doubt this would cause your wrong answers, but it is worth a try.

If these don’t solve the issue, can you please send a reproducing example to PGI Customer Service (trs@pgroup.com) and ask them to send it to me? That should help me determine where the problem is.

-Mat

I'm very sorry about a mistake in my earlier posts.
I pasted the two code snippets into the wrong places in the posts
(the day I posted this I was very tired from debugging the code and forgot to check what I posted).
The correct post has a slight difference, that is:

#pragma acc data present(testConvLayer[0:1], \
    testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH], \
    testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH], \
    testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])
    {
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            TOA_conv3D(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara,
                       testConvLayer->filter_weights_ACC, &testConvLayer->tv_filterPara,
                       testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara, &filter_index,
                       &testConvLayer->tv_stride);
        }
    }

result in:

Output Array
999.000 999.000 999.000 
999.000 999.000 999.000 
999.000 999.000 999.000 

999.000 999.000 999.000 
999.000 999.000 999.000 
999.000 999.000 999.000

and

#pragma acc parallel loop gang present(testConvLayer[0:1], \
    testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH], \
    testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH], \
    testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])\
    num_workers(1)
    for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
    {
        TOA_conv3D(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara,
                   testConvLayer->filter_weights_ACC, &testConvLayer->tv_filterPara,
                   testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara, &filter_index,
                   &testConvLayer->tv_stride);
    }

result in

Output Array
  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000 

  0.000   0.000   0.000 
  0.000   0.000   0.000 
  0.000   0.000   0.000

The version of the OpenACC code that seems to do nothing is:

#pragma acc parallel loop gang present(testConvLayer[0:1], \
    testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH], \
    testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH], \
    testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])\
    num_workers(1)
    for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
    {
        TOA_conv3D(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara,
                   testConvLayer->filter_weights_ACC, &testConvLayer->tv_filterPara,
                   testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara, &filter_index,
                   &testConvLayer->tv_stride);
    }

not

#pragma acc data present(testConvLayer[0:1], \
    testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH], \
    testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH], \
    testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])
    {
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            TOA_conv3D(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara,
                       testConvLayer->filter_weights_ACC, &testConvLayer->tv_filterPara,
                       testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara, &filter_index,
                       &testConvLayer->tv_stride);
        }
    }




Ok, though I don’t see anything obvious from the code you posted. One possibility is that you’re not copying back the data from the device, but you could have an update directive later in the source.

Please send a reproducing example to PGI Customer Service (trs@pgroup.com) and ask them to send it to me. I’ll take a look and see what I can determine.

Thanks,
Mat

I have sent it, thanks~~

Thanks HiJacker. I took a look and the problem is that you need to update the host array after you do the computation on the device. Otherwise, you’re printing old values.

#pragma acc parallel loop gang present(testConvLayer[0:1], \
    testConvLayer->tv_input_array[0:testConvLayer->tv_inputPara.DEPTH][0:testConvLayer->tv_inputPara.HEIGHT][0:testConvLayer->tv_inputPara.WIDTH], \
    testConvLayer->filter_weights_ACC[0:testConvLayer->tv_filter_number][0:testConvLayer->tv_filterPara.DEPTH][0:testConvLayer->tv_filterPara.HEIGHT][0:testConvLayer->tv_filterPara.WIDTH], \
    testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])\
    num_workers(1)
        for (CPTYPE filter_index = 0; filter_index < testConvLayer->tv_filter_number; filter_index++)
        {
            TOA_conv3D(testConvLayer->tv_input_array, &testConvLayer->tv_inputPara,
                       testConvLayer->filter_weights_ACC, &testConvLayer->tv_filterPara,
                       testConvLayer->tv_output_array_ACC, &testConvLayer->tv_outputPara, &filter_index,
                       &testConvLayer->tv_stride);
        }
    }
#pragma acc update host(testConvLayer->tv_output_array_ACC[0:testConvLayer->tv_outputPara.DEPTH][0:testConvLayer->tv_outputPara.HEIGHT][0:testConvLayer->tv_outputPara.WIDTH])

Thank you for your reply, and I sent you another e-mail for this problem, because there is still something wrong. hope for your reply.