Can someone test my code? My GPU only has compute capability (CC) 1.2.

I have the following code for calculating the Mandelbrot set. After some tweaking, I managed to remove some of the errors regarding the function call. Now I get a few errors about double-precision operations. Because my GPU has compute capability 1.2, I cannot use double precision: as far as I know, support for it was only added in compute capability 1.3. Therefore, I would be glad if someone whose GPU has CC 1.3 or greater could compile my code on their computer and tell me the result. Below you can find how I compile, the messages from -Minfo, the error message that I get when running my program, and my code.

How I compile:

pgcc -acc -Minfo=accel -ta=nvidia,cc12 -lm -o mandopenacc mandopenacc.c

Messages:

mandelbrot:
34, Generating present_or_copyout(m[0:400][0:600])
Generating NVIDIA code
Double precision operations disable compute capability 1.0 kernel
35, Loop is parallelizable
36, Loop is parallelizable
Accelerator kernel generated
35, #pragma acc loop gang /* blockIdx.y */
36, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
39, Double precision operations disable compute capability 1.2 kernel
40, Double precision operations disable compute capability 1.2 kernel
41, Double precision operations disable compute capability 1.2 kernel
45, Loop carried scalar dependence for 'newRe' at line 47
Scalar last value needed after loop for 'newRe' at line 66
Loop carried scalar dependence for 'newIm' at line 48
Scalar last value needed after loop for 'newIm' at line 66
Inner sequential loop scheduled on accelerator
47, Double precision operations disable compute capability 1.2 kernel
48, Double precision operations disable compute capability 1.2 kernel
50, Double precision operations disable compute capability 1.2 kernel
51, Double precision operations disable compute capability 1.2 kernel
53, Double precision operations disable compute capability 1.2 kernel
66, Double precision operations disable compute capability 1.2 kernel
67, Double precision operations disable compute capability 1.2 kernel

Error that I get after running the program:

etairi@Progex:~/Projects/mandelbrot$ ./mandopenacc 20000 > out1.ppm
call to cuModuleLoadData returned error 200: Invalid image

Code:

// mandopenacc.c
// to compile: pgcc -acc -Minfo=accel -ta=nvidia,cc12 -o mandopenacc mandopenacc.c
// usage: ./mandopenacc <no_of_iterations> > output.ppm

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <openacc.h>

/* One image pixel; each channel is stored as an int (0..255 when written out). */
typedef struct {
	int r;  /* red channel */
	int g;  /* green channel */
	int b;  /* blue channel */
} rgb;


/*
 * Render the Mandelbrot set into a 600x400 pixel image.
 *
 * niterations: maximum number of z -> z*z + p steps tried for each pixel.
 *              Values <= 0 produce an all-black image.
 * m:           400 row pointers, each pointing to 600 rgb entries; every
 *              entry is overwritten.
 *
 * Pixels whose orbit never leaves the circle of radius 2 are black. All other
 * pixels get r = g = brightness (clamped to 0..255) and b = 255.
 */
void mandelbrot(int niterations, rgb **m)
{
	int w = 600, h = 400, x, y, i;
	// each iteration calculates: newz = oldz*oldz + p,
	// where p is the current pixel, and oldz starts at the origin
	double pr, pi;                       // real and imaginary part of the pixel p
	double newRe, newIm, oldRe, oldIm;   // real and imaginary parts of new and old z
	double zoom = 1, moveX = -0.5, moveY = 0; // you can change these to zoom and change position
	// Loop-invariant normaliser for the brightness. log(1) is 0 and the log of
	// a non-positive count is undefined, so fall back to 1 in those cases.
	double logMaxIter = (niterations > 1) ? log((double)niterations) : 1.0;

	// loop through every pixel
	#pragma acc kernels loop private(y,x,i,newRe,newIm,oldRe,oldIm)
	for(y = 0; y < h; y++) {
		for(x = 0; x < w; x++) {
			// calculate the initial real and imaginary part of z,
			// based on the pixel location and zoom and position values
			pr = 1.5 * (x - w / 2) / (0.5 * zoom * w) + moveX;
			pi = (y - h / 2) / (0.5 * zoom * h) + moveY;
			newRe = newIm = oldRe = oldIm = 0; // these should start at 0,0
			// i counts the iterations performed before the point escaped
			for(i = 0; i < niterations; i++) {
				// remember value of previous iteration
				oldRe = newRe;
				oldIm = newIm;
				// the actual iteration, the real and imaginary part are calculated
				newRe = oldRe * oldRe - oldIm * oldIm + pr;
				newIm = 2 * oldRe * oldIm + pi;
				// if the point is outside the circle with radius 2: stop
				if((newRe * newRe + newIm * newIm) > 4) break;
			}

			// ">=" rather than "==": with a negative niterations the loop never
			// runs, and z would still be 0 in the colouring branch (log(0)).
			if(i >= niterations)
			{
				m[y][x].r = 0;
				m[y][x].g = 0;
				m[y][x].b = 0;
			}
			else
			{
				// normalized iteration count method for proper coloring
				double z = sqrt(newRe * newRe + newIm * newIm);
				double shade = 256. * log(1.75 + i - log(log(z))) / logMaxIter;
				int brightness;

				// Clamp before converting: points escaping on the last iterations
				// yield values >= 256, which would wrap around when the channel
				// is later written out as a single byte.
				if(!(shade > 0))
					brightness = 0;
				else if(shade > 255)
					brightness = 255;
				else
					brightness = (int)shade;

				m[y][x].r = brightness;
				m[y][x].g = brightness;
				m[y][x].b = 255;
			}
		}
	}
}

/*
 * Entry point: renders the Mandelbrot set with the iteration limit given as
 * the single command-line argument and writes a 600x400 binary PPM (P6) image
 * to stdout. Diagnostics and the elapsed time go to stderr so they never end
 * up inside the redirected image file.
 *
 * Returns 0 on success and 1 on bad usage, allocation failure or write error.
 */
int main(int argc, char *argv[])
{
	int niterations, i, j;
	int status = 0;
	char *tail;
	long parsed;

	if(argc != 2)
	{
		// stdout is normally redirected into the image file, so report on stderr
		fprintf(stderr, "Usage: %s <no_of_iterations> > output.ppm\n", argv[0]);
		return 1;
	}

	// strtol instead of atoi so that garbage and out-of-range input is detected
	parsed = strtol(argv[1], &tail, 10);
	niterations = (int)parsed;
	if(tail == argv[1] || *tail != '\0' || parsed < 1 || (long)niterations != parsed)
	{
		fprintf(stderr, "%s: number of iterations must be a positive integer, got '%s'\n",
			argv[0], argv[1]);
		return 1;
	}

	rgb **m = malloc(400 * sizeof *m);
	if(m == NULL)
	{
		fprintf(stderr, "%s: out of memory\n", argv[0]);
		return 1;
	}
	for(i = 0; i < 400; i++)
	{
		m[i] = malloc(600 * sizeof *m[i]);
		if(m[i] == NULL)
		{
			fprintf(stderr, "%s: out of memory\n", argv[0]);
			while(i > 0)
				free(m[--i]);
			free(m);
			return 1;
		}
	}

	clock_t begin, end;
	double time_spent;

	begin = clock();
	mandelbrot(niterations, m);

	printf("P6\n# AUTHOR: Erkan Tairi\n");
	printf("%d %d\n255\n",600,400);

	for(i = 0; i < 400; i++) {
		for(j = 0; j < 600; j++) {
			// each channel is written as one raw byte
			fputc((unsigned char)m[i][j].r, stdout);
			fputc((unsigned char)m[i][j].g, stdout);
			fputc((unsigned char)m[i][j].b, stdout);
		}
	}

	if(fflush(stdout) != 0 || ferror(stdout))
	{
		fprintf(stderr, "%s: error while writing the image\n", argv[0]);
		status = 1;
	}

	end = clock();

	time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
	fprintf(stderr, "Elapsed time: %.2lf seconds.\n", time_spent);

	for(i = 0; i < 400; i++)
		free(m[i]);
	free(m);

	return status;
}

Hi erkant,

Alternatively, you can compile the code in single precision (built with -DSINGLE below).

% cat mandlebrot.c 
// mandopenacc.c
// to compile: pgcc -acc -Minfo=accel -ta=nvidia,cc12 -o mandopenacc mandopenacc.c
// usage: ./mandopenacc <no_of_iterations> > output.ppm

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#ifdef _OPENACC
#include <openacc.h>
#endif

// Floating-point precision switch. Compiling with -DSINGLE selects float and
// the single-precision math functions; otherwise double and the
// double-precision functions are used.
//   REAL - floating-point type used for the computation
//   LOG  - natural logarithm matching REAL
//   SQRT - square root matching REAL
#ifdef SINGLE
#define REAL float
#define LOG logf
#define SQRT sqrtf
#else
#define REAL double
#define LOG log
#define SQRT sqrt
#endif

/* One image pixel; each channel is stored as an int (0..255 when written out). */
typedef struct {
  int r;  /* red channel */
  int g;  /* green channel */
  int b;  /* blue channel */
} rgb;


/*
 * Render the Mandelbrot set into a 600x400 pixel image.
 *
 * niterations: maximum number of z -> z*z + p steps tried for each pixel.
 *              Values <= 0 produce an all-black image.
 * m:           400 row pointers, each pointing to 600 rgb entries; every
 *              entry is overwritten.
 *
 * All arithmetic is carried out in REAL. The floating-point constants are
 * converted to REAL explicitly so that a -DSINGLE build does not silently
 * promote the expressions to double.
 *
 * Pixels whose orbit never leaves the circle of radius 2 are black. All other
 * pixels get r = g = brightness (clamped to 0..255) and b = 255.
 */
void mandelbrot(int niterations, rgb **m)
{
  int w = 600, h = 400, x, y, i;
  // each iteration calculates: newz = oldz*oldz + p,
  // where p is the current pixel, and oldz starts at the origin
  REAL pr, pi;                       // real and imaginary part of the pixel p
  REAL newRe, newIm, oldRe, oldIm;   // real and imaginary parts of new and old z
  REAL zoom = 1, moveX = (REAL)-0.5, moveY = 0; // you can change these to zoom and change position
  // Loop-invariant normaliser for the brightness. log(1) is 0 and the log of
  // a non-positive count is undefined, so fall back to 1 in those cases.
  REAL logMaxIter = (niterations > 1) ? LOG((REAL)niterations) : (REAL)1;

  // loop through every pixel
#pragma acc kernels loop private(newRe,newIm,oldRe,oldIm)
  for(y = 0; y < h; y++) {
    for(x = 0; x < w; x++) {
      // calculate the initial real and imaginary part of z,
      // based on the pixel location and zoom and position values
      pr = (REAL)1.5 * (x - w / 2) / ((REAL)0.5 * zoom * w) + moveX;
      pi = (y - h / 2) / ((REAL)0.5 * zoom * h) + moveY;
      newRe = newIm = oldRe = oldIm = 0; // these should start at 0,0
      // i counts the iterations performed before the point escaped
      for(i = 0; i < niterations; i++) {
        // remember value of previous iteration
        oldRe = newRe;
        oldIm = newIm;
        // the actual iteration, the real and imaginary part are calculated
        newRe = oldRe * oldRe - oldIm * oldIm + pr;
        newIm = 2 * oldRe * oldIm + pi;
        // if the point is outside the circle with radius 2: stop
        if((newRe * newRe + newIm * newIm) > 4) break;
      }

      // ">=" rather than "==": with a negative niterations the loop never
      // runs, and z would still be 0 in the colouring branch (log(0)).
      if(i >= niterations)
        {
          m[y][x].r = 0;
          m[y][x].g = 0;
          m[y][x].b = 0;
        }
      else
        {
          // normalized iteration count method for proper coloring
          REAL z = SQRT(newRe * newRe + newIm * newIm);
          REAL shade = (REAL)256 * LOG((REAL)1.75 + i - LOG(LOG(z))) / logMaxIter;
          int brightness;

          // Clamp before converting: points escaping on the last iterations
          // yield values >= 256, which would wrap around when the channel
          // is later written out as a single byte.
          if(!(shade > 0))
            brightness = 0;
          else if(shade > 255)
            brightness = 255;
          else
            brightness = (int)shade;

          m[y][x].r = brightness;
          m[y][x].g = brightness;
          m[y][x].b = 255;
        }
    }
  }
}

/*
 * Entry point: renders the Mandelbrot set with the iteration limit given as
 * the single command-line argument and writes a 600x400 binary PPM (P6) image
 * to stdout. Diagnostics and the elapsed time go to stderr so they never end
 * up inside the redirected image file.
 *
 * Returns 0 on success and 1 on bad usage, allocation failure or write error.
 */
int main(int argc, char *argv[])
{
  int niterations, i, j;
  int status = 0;
  char *tail;
  long parsed;

  if(argc != 2)
    {
      // stdout is normally redirected into the image file, so report on stderr
      fprintf(stderr, "Usage: %s <no_of_iterations> > output.ppm\n", argv[0]);
      return 1;
    }

  // strtol instead of atoi so that garbage and out-of-range input is detected
  parsed = strtol(argv[1], &tail, 10);
  niterations = (int)parsed;
  if(tail == argv[1] || *tail != '\0' || parsed < 1 || (long)niterations != parsed)
    {
      fprintf(stderr, "%s: number of iterations must be a positive integer, got '%s'\n",
              argv[0], argv[1]);
      return 1;
    }

  rgb **m = malloc(400 * sizeof *m);
  if(m == NULL)
    {
      fprintf(stderr, "%s: out of memory\n", argv[0]);
      return 1;
    }
  for(i = 0; i < 400; i++)
    {
      m[i] = malloc(600 * sizeof *m[i]);
      if(m[i] == NULL)
        {
          fprintf(stderr, "%s: out of memory\n", argv[0]);
          while(i > 0)
            free(m[--i]);
          free(m);
          return 1;
        }
    }

  clock_t begin, end;
  // timing is independent of the compute precision, so always use double
  double time_spent;

  begin = clock();
  mandelbrot(niterations, m);

  printf("P6\n# AUTHOR: Erkan Tairi\n");
  printf("%d %d\n255\n",600,400);

  for(i = 0; i < 400; i++) {
    for(j = 0; j < 600; j++) {
      // each channel is written as one raw byte
      fputc((unsigned char)m[i][j].r, stdout);
      fputc((unsigned char)m[i][j].g, stdout);
      fputc((unsigned char)m[i][j].b, stdout);
    }
  }

  if(fflush(stdout) != 0 || ferror(stdout))
    {
      fprintf(stderr, "%s: error while writing the image\n", argv[0]);
      status = 1;
    }

  end = clock();

  time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
  fprintf(stderr, "Elapsed time: %.2lf seconds.\n", time_spent);

  for(i = 0; i < 400; i++)
    free(m[i]);
  free(m);

  return status;
}
% pgcc mandlebrot.c -acc -Minfo=accel -V13.5 -ta=nvidia,4.2,cc12 -DSINGLE -Mfcon
mandelbrot:
     38, Generating present_or_copyout(m[0:400][0:600])
         Generating NVIDIA code
         Generating compute capability 1.2 binary
     39, Loop is parallelizable
     40, Loop is parallelizable
         Accelerator kernel generated
         39, #pragma acc loop gang /* blockIdx.y */
         40, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     48, Loop carried scalar dependence for 'newRe' at line 50
         Scalar last value needed after loop for 'newRe' at line 68
         Loop carried scalar dependence for 'newIm' at line 51
         Scalar last value needed after loop for 'newIm' at line 68
         Inner sequential loop scheduled on accelerator
     57, Accelerator restriction: induction variable live-out from loop: i
% a.out 1024 > output.ppm
Elapsed time: 0.43 seconds.
  • Mat