Hello all,

As a total CUDA beginner and math enthusiast, I tried to code BuddhaBrothttps://en.wikipedia.org/wiki/Buddhabrot computation on CUDA to take advantage of the parallelization.

To compare performances, I wrote a CPU computation and a GPU one. However, while the GPU one is indeed much faster, it keeps producing images with “artifacts” (https://ibb.co/G0kY1FB) while the CPU computation yields good images (https://ibb.co/sVH992h)> The two images had the same parameters.

The code, however, is really similar between CPU and GPU compute :

```
*
Computation on the GPU :
*/
__global__
void buddha_kernel(unsigned int* matrix, int num_iter, int pass_size, int size, curandState* states) {
// matrix: output array, of length size*size
// num_iter: maximum number of iterations
// size: size of one side of the render, in pixels
// states: initalizations for the curand generator.
// pass_size : number of points to compute in one pass
int id = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
curandState state = states[id];
for (int i = id; i < pass_size; i += stride) {
// Initializes points
float z_x = 0.f;
float z_y = 0.f;
float c_x = (curand_uniform(&state)*3) - 2;
float c_y = (curand_uniform(&state) * 2) - 1;
float * orbit = new float[2 * num_iter];
int iter = 0;
// Compute the orbit of the point
while (z_x * z_x + z_y + z_y < 4 && iter < num_iter) {
float temp = z_x * z_x - z_y * z_y + c_x;
z_y = 2 * z_x * z_y + c_y;
z_x = temp;
orbit[2 * iter] = z_x;
orbit[2 * iter + 1] = z_y;
iter++;
}
states[id] = state;
// If the point has escaped, increment its orbit in the render matrix.
if (z_x * z_x + z_y * z_y > 4) {
for (int j = 0; j < 2 * iter; j += 2) {
// compute the corresponding pixel
float p_x = orbit[j] + 2 ;
p_x = p_x / 3;
float p_y = orbit[j+1] + 1.5;
p_y = p_y / 3;
int pix_x = (int)(p_x * size);
int pix_y = (int)(p_y * size);
int coord = pix_x * size + pix_y;
// check if the pixel is indeed within the image matrix
if (coord > -1 && coord < size*size) {
matrix[coord] ++;
}
}
}
delete[] orbit;
}
}
/*
Computation on the CPU
*/
void compute_buddha(unsigned int* r, float *c, int n_c) {
// r : return array, of size SIZE*SIZE
// c : array of random samples
// n_c : size of c
int N = SIZE * SIZE;
for (int i = 0; i < n_c; i += 2) {
float c_x = c[i];
float c_y = c[i + 1];
float z_x = 0;
float z_y = 0;
int iter = 0;
float *orbit = new float[2 * MAX_ITER];
while (z_x * z_x + z_y * z_y < 4 && iter < MAX_ITER) {
float temp = z_x * z_x - z_y * z_y + c_x;
z_y = 2 * z_x * z_y + c_y;
z_x = temp;
orbit[2 * iter] = z_x;
orbit[2 * iter + 1] = z_y;
iter++;
}
if (z_x * z_x + z_y * z_y > 4) {
for (int j = 0; j < 2 * iter; j += 2) {
float p_x = orbit[j] + 2;
p_x = p_x / 3;
int pix_x = (int)(p_x * SIZE);
float p_y = orbit[j + 1] + 1.5;
p_y = p_y / 3;
int pix_y = (int)(p_y * SIZE);
int coord = SIZE * pix_x + pix_y;
if (coord < N && coord > -1)
r[coord] ++;
}
}
delete[] orbit;
}
}
```

The two methods are respectively on the GPU and on the CPU. The only difference is that random points are generated in the GPU method, but even if I compute the two methods on a same set of random points, the results remain with the ugly GPU computed image…

Where do those artifacts come from ? Is there some kind of math subtility that I do not understand in CUDA ?

Thank you very much for the help !