Hello,
I am fairly new to OpenAcc programming and I am a student. I am trying to write a mergesort using openAcc but I am running into problems of having dependencies that prevent parallelization. Can mergesort be parallelized in the way I have written it? I also wrote a bitonic sort using openacc and that works fine without dependencies. I tried making the scalar values private and it supressed the warnings but the time remained the same on about 10M elements I am getting about 5 seconds when the bitonic version is less than 0.3 sec and the serial version without openacc is about 1 second.
#include<stdlib.h>
#include<stdio.h>
#include<stdbool.h>
#include<time.h>
#include<openacc.h>
#include<math.h>
// Utility function to find minimum of two integers
#pragma acc routine seq
int inline min(int x, int y)
{
return (x < y) ? x : y;
}
// Iteratively sort array A[low..high] using temporary array
void mergesort(double *restrict A, double *restrict temp, int low, int high)
{
int m=0, s=0;
int i=0,j=0,k=0;
// divide the array into blocks of size m
// m = [1, 2, 4, 8, 16...]
#pragma acc data copy(A[0:high]) copyin(temp[0:high], i, j, k)
{
{
{
for (m = 1; m <= high - low; m = 2 * m) {
{
#pragma acc parallel loop gang worker vector independent \
present(A[0:high], temp[0:high]) \
firstprivate(m,i,j,k)
for (s = low; s < high; s += 2 * m) {
int from = s;
int mid = s + m - 1;
int to = min(s + 2 * m - 1, high);
k = from;
i = from;
j = mid + 1;
#pragma acc loop independent
for (k = from; k <= to; k++) {
if (i <= mid && j <= to) {
if (A[i] < A[j]) {
temp[k] = A[i++];
}
else {
temp[k] = A[j++];
}
} else break;
}
#pragma acc loop independent
for(int f=i; f < high; f++){
if(f>mid) break;
temp[k++] = A[f];
}
}
#pragma acc parallel loop gang worker vector independent
for (int i = 0; i <= high; i++) {
A[i] = temp[i];
}
}
}
}
}
}
}
/* Driver program to test above functions */
int main(int argc, char **argv)
{
clock_t ts, te;
int n = atoi(argv[1]);
double *arr = (double *)malloc(n*sizeof(double));
double *tmp = (double *)malloc(n*sizeof(double));
// Init Random Seed
srand(time(NULL));
int i;
/** Initialize array with random values **/
for (i = 0; i < n; i++) {
arr[i] = rand()%n;
tmp[i] = arr[i];
}
/** End random initialization **/
ts = clock();
mergesort(arr, tmp, 0, n - 1);
te = clock();
if(check_sorted(arr,n)){
printf("Sorted\n");
} else{
printf("Not sorted\n");
}
printf("Time: %0.06fs\n", (double)(te-ts)/CLOCKS_PER_SEC);
free(arr);
free(tmp);
return 0;
}
The output during compilation is:
pgcc -O3 -acc -fast -ta=tesla:managed -Minfo=accel -lm -o mergesort_acc main.c
mergesort:
50, Generating copyin(k,temp[:high]) [if not already present]
Generating copy(A[:high]) [if not already present]
Generating copyin(i,j) [if not already present]
56, Generating present(temp[:high],A[:high])
60, Generating Tesla code
63, #pragma acc loop gang, worker(4), vector(32) /* blockIdx.x threadIdx.y threadIdx.x */
75, Loop carried scalar dependence for j at line 76
Loop carried scalar dependence for i at line 78
Loop carried scalar dependence for j at line 77
Loop carried scalar dependence for i at line 76,77
Loop carried scalar dependence for j at line 81
87, Loop carried dependence of temp-> prevents parallelization
Loop carried backward dependence of temp-> prevents vectorization
93, Generating Tesla code
94, #pragma acc loop gang, worker(4), vector(32) /* blockIdx.x threadIdx.y threadIdx.x */