Different outputs Parallel x Sequential Execution

Hi

My code has different outputs compared with the sequential execution.

Am I doing something wrong?

My code:

#include <iostream>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>

using namespace std;

int main() {
	struct timeval inicio, fim;
	int tmili;

	double * hidden_bias;
	double * output_bias;
	double ** input_weights;
	double ** output_weights;
	double ** computation;
	double ** inputs_lote;
	double ** outputs_lote;

	unsigned int qtd_entradas_saidas = 5;
	unsigned int entradas=30;
	unsigned int saidas=5;
	unsigned int altura=1000;

		hidden_bias = (double *)malloc(sizeof(double)*altura);
		output_bias = (double *)malloc(sizeof(double)*saidas);
		input_weights = (double **)malloc(sizeof(double)*altura);
		for(unsigned int i=0; i < altura; i++){
			input_weights[i] = (double *)malloc(sizeof(double)*entradas);
		}
		output_weights = (double **)malloc(sizeof(double)*saidas);
		for(unsigned int i=0; i < saidas; i++){
			output_weights[i] = (double *)malloc(sizeof(double)*altura);
		}
		computation = (double **)malloc(sizeof(double)*qtd_entradas_saidas);
		for(unsigned int i=0; i < qtd_entradas_saidas; i++){
			computation[i] = (double *)malloc(sizeof(double)*altura);
		}
		inputs_lote = (double **)malloc(sizeof(double)*qtd_entradas_saidas);
		for(unsigned int i=0; i < qtd_entradas_saidas; i++){
			inputs_lote[i] = (double *)malloc(sizeof(double)*entradas);
		}
		outputs_lote = (double **)malloc(sizeof(double)*qtd_entradas_saidas);
		for(unsigned int i=0; i < qtd_entradas_saidas; i++){
			outputs_lote[i] = (double *)malloc(sizeof(double)*saidas);
		}

		int semente = 10;

		//Inicializar a semente para rand
		srand(semente);

		//passar por parametro vetor gerado aleatoreamente
		//cout << "Gerando pesos";
		double pesos[entradas * altura + saidas * altura + altura + saidas];
		for(unsigned int i=0; i<(entradas * altura + saidas * altura + altura + saidas); i++){
			double num = (rand()/(double)RAND_MAX); //TODO: conferir
			if(rand()%2==0){
				num = num * -1;
			}
			pesos[i] = num;
			//cout << pesos[i] << ", ";
		}

		//passar por parametro vetor gerado aleatoreamente
		//cout << "\nEntradas: ";
		for(unsigned int i = 0; i<qtd_entradas_saidas; i++){
			for(unsigned int j = 0; j<entradas; j++){
				double num = (rand()/(double)RAND_MAX); //TODO: conferir
				if(rand()%2==0){
					num = num * -1;
				}
				inputs_lote[i][j] = num;
				//cout << "Entrada[" << i << "][" << j << "] = " << inputs_lote[i][j] << "\n";
			}
			//cout << "\n";
		}

		//Definindo Pesos
		unsigned int v = 0;

		for(unsigned int h = 0; h < altura; h++)
		{
			hidden_bias[h] = pesos[v++];
		}
		for(unsigned int o = 0; o < saidas; o++)
		{
			output_bias[o] = pesos[v++];
		}
		for(unsigned int h = 0; h < altura; h++)
		{
			for(unsigned int i = 0; i < entradas; i++)
			{
				input_weights[h][i] = pesos[v++];
			}
		}
		for(unsigned int o = 0; o < saidas; o++)
		{
			for(unsigned int w = 0; w < altura; w++)
			{
				output_weights[o][w] = pesos[v++];
			}
		}


		//cout << "\nExecutando a rede";

		gettimeofday(&inicio, NULL);

		#pragma acc data copyin(hidden_bias[0:altura],inputs_lote[0:qtd_entradas_saidas][0:entradas],input_weights[0:altura][0:entradas],output_bias[0:saidas],output_weights[0:saidas][0:altura]), create(computation[0:qtd_entradas_saidas][0:altura]), copyout(outputs_lote[0:qtd_entradas_saidas][0:saidas])
		#pragma acc parallel
		{
			#pragma acc loop gang(512)
			for(unsigned int p=0; p<qtd_entradas_saidas; p++){

					#pragma acc loop worker(512)
					for(unsigned int height = 0; height < altura; height++)
					{
						double sinapse = hidden_bias[height];
						#pragma acc loop seq
						for(unsigned int weight = 0; weight < entradas; weight++)
						{
							sinapse += inputs_lote[p][weight] * input_weights[height][weight];
						}
						computation[p][height] = sinapse > 0 ? 1.0 : 0.0;
					}

					#pragma acc loop worker(5)
					for(unsigned int height = 0; height < saidas; height++)
					{
						double sinapse = output_bias[height];
						#pragma acc loop seq
						for(unsigned int weight = 0; weight < altura; weight++)
						{
							sinapse += computation[p][weight] * output_weights[height][weight];
						}
						outputs_lote[p][height] = sinapse > 0 ? 1.0 : 0.0;
					}
			}
		}

		gettimeofday(&fim, NULL);

		cout << "\nOutputs: ";
		for(unsigned int i=0; i<qtd_entradas_saidas; i++){
			for(unsigned int j = 0; j<saidas; j++){
				cout << "Saida[" << i << "][" << j << "] = " << outputs_lote[i][j] << "\n";
			}
			cout << "\n";
		}

		tmili = (int) (1000 * (fim.tv_sec - inicio.tv_sec) + (fim.tv_usec - inicio.tv_usec) / 1000);

		cout << "\nTempo decorrido: E: " << entradas << ", A: " << altura << ", S: " << saidas << ", T: " << tmili;


	return 0;

}

Hi Miqueias,

You have an illegal schedule. You’re not allowed to vary the number of workers or vectors within the same parallel region. Also, “parallel” does not allow the width to be specified on the loop schedule clauses.

To fix, move the width values to the “parallel” directive’s “num_gangs” and “num_workers” clauses.

% diff -u test_11_24_14.org.cpp test_11_24_14.cpp
--- test_11_24_14.org.cpp       2014-11-24 11:04:27.244268000 -0800
+++ test_11_24_14.cpp   2014-11-24 11:07:47.005015000 -0800
@@ -109,12 +109,12 @@
        gettimeofday(&inicio, NULL);

        #pragma acc data copyin(hidden_bias[0:altura],inputs_lote[0:qtd_entradas_saidas][0:entradas],input_weights[0:altura][0:entradas],output_bias[0:saidas],output_weights[0:saidas][0:altura]), create(computation[0:qtd_entradas_saidas][0:altura]), copyout(outputs_lote[0:qtd_entradas_saidas][0:saidas])
-       #pragma acc parallel
+       #pragma acc parallel num_gangs(512) num_workers(512)
        {
-          #pragma acc loop gang(512)
+          #pragma acc loop gang
           for(unsigned int p=0; p<qtd_entradas_saidas; p++){

-                #pragma acc loop worker(512)
+                #pragma acc loop worker
                 for(unsigned int height = 0; height < altura; height++)
                 {
                    double sinapse = hidden_bias[height];
@@ -126,7 +126,7 @@
                    computation[p][height] = sinapse > 0 ? 1.0 : 0.0;
                 }

-                #pragma acc loop worker(5)
+                #pragma acc loop worker
                 for(unsigned int height = 0; height < saidas; height++)
                 {
                    double sinapse = output_bias[height];

Note that the C++ compiler should be catching this error but for some reason is letting it through (and hence why bad code is being generated). If I rewrite the code to C, then the C compiler correctly catches the error. I have added a problem report (TPR#21089) requesting that the C++ compiler catches the use of an illegal schedule.

Here’s what the error should look like:

% pgcc test_11_24_14.org.c -acc -Minfo=accel
PGC-S-0155-Clause not allowed in #pragma - gang(value) (test_11_24_14.org.c: 113)
PGC-S-0155-Clause not allowed in #pragma - worker(value) (test_11_24_14.org.c: 116)
PGC-S-0155-Clause not allowed in #pragma - worker(value) (test_11_24_14.org.c: 128)

Hope this helps,
Mat

Hi

Thanks for the reply.

I will make corrections and test the code again.

Att,
Miquéias

GOD Bless