Hello, I’m a CUDA programming beginner. I’m writing a fractal (Mandelbrot set) computation program using CUDA through PyCUDA, but I found that #pragma unroll 5 is slower than a loop I unrolled by hand. Here is my test code.
Environment: i7-5820K, GTX 970, Ubuntu 16.04.3, CUDA 9.0.176, Python 3.5.2, PyCUDA 2017.1.1
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np
import matplotlib.cm as cm
import colorcet as cc
from matplotlib.colors import Normalize
from PIL import Image
from time import time
cu = SourceModule("""
__global__ void calcMandelbrot(double *matrix, int *result) {
    int idx_x = blockIdx.x * blockDim.x + threadIdx.x;
    int idx_y = blockIdx.y * blockDim.y + threadIdx.y;
    double a = matrix[0] + (float)idx_x * matrix[2];
    double b = matrix[1] + (float)idx_y * matrix[2];
    double an = a, bn = b;
    double aan = an * an, bbn = bn * bn;
    #pragma unroll 5
    for (int i = 0; i < 1000; i++) {
        bn = 2 * an * bn + b;
        an = aan - bbn + a;
        aan = an * an;
        bbn = bn * bn;
        if (an * an + bn * bn > 4.0f) {
            result[idx_x * gridDim.y * blockDim.y + idx_y] = i;
            break;
        }
    }
}
""")
def calcMandelbrot(start, size, unit):
    s = time()
    # round size up to the next multiple of 256 (>= the requested size) so it divides evenly into 16x16 thread blocks
    size = ((size + 255) / 256).astype(np.uint32) * 256
    result = np.empty((size[0], size[1]), np.int32)
    func = cu.get_function("calcMandelbrot")
    func(cuda.In(np.array([start[0] - size[0] / 2 * unit,
                           start[1] - size[1] / 2 * unit, unit], np.float64)),
         cuda.Out(result),
         block=(16, 16, 1), grid=(int(size[0] / 16), int(size[1] / 16)))
    result = np.transpose(result)
    print("%f s" % (time() - s))
calcMandelbrot(np.array([0.39514148, -0.18187805]), np.array([800, 800]), 0.00015650431982753305)
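A note on the measurement: the times quoted below come from Python’s time() around the whole function, so they include the cuda.In/cuda.Out copies and the transpose, not just the kernel. In case that distinction matters, here is a minimal sketch of timing only the kernel with PyCUDA’s event API. This is not what produced the numbers below; matrix_host, result_host, and size are placeholder names for the values built inside calcMandelbrot above.

# Sketch: time only the kernel with CUDA events, doing the copies up front.
# matrix_host / result_host are placeholders for the arrays built in calcMandelbrot.
matrix_gpu = cuda.mem_alloc(matrix_host.nbytes)
result_gpu = cuda.mem_alloc(result_host.nbytes)
cuda.memcpy_htod(matrix_gpu, matrix_host)

start_evt, end_evt = cuda.Event(), cuda.Event()
start_evt.record()
func(matrix_gpu, result_gpu,
     block=(16, 16, 1), grid=(int(size[0] / 16), int(size[1] / 16)))
end_evt.record()
end_evt.synchronize()                              # wait for the kernel to finish
print("kernel: %f ms" % start_evt.time_till(end_evt))

cuda.memcpy_dtoh(result_host, result_gpu)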
With #pragma unroll 5 it takes about 0.05 s. I also tried unroll factors of 10, 100, and so on, but the timings are similar.
However, the code I unrolled by hand is faster than the #pragma unroll version.
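For reference, nvcc’s intermediate files can be kept and inspected to check how far the loop is actually unrolled in each version. This is only a sketch: keep, arch, and options are standard SourceModule/nvcc arguments, kernel_source is a placeholder for the CUDA string above, and sm_52 is the GTX 970’s compute architecture.

# Sketch: recompile the kernel but keep nvcc's temporary directory (PyCUDA prints
# its path); the .ptx file inside shows how far the loop body was replicated.
# kernel_source is a placeholder for the CUDA source string used above.
from pycuda.compiler import SourceModule

mod = SourceModule(kernel_source,
                   arch="sm_52",                       # GTX 970 (Maxwell)
                   keep=True,                          # keep (and print the path of) nvcc's temp files
                   options=["--ptxas-options=-v"])     # ptxas also reports per-thread register usage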
Here is my unrolled code.
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np
import matplotlib.cm as cm
import colorcet as cc
from matplotlib.colors import Normalize
from PIL import Image
from time import time
cu = SourceModule("""
__global__ void calcMandelbrot(double *matrix, int *result) {
    int idx_x = blockIdx.x * blockDim.x + threadIdx.x;
    int idx_y = blockIdx.y * blockDim.y + threadIdx.y;
    double a = matrix[0] + (float)idx_x * matrix[2];
    double b = matrix[1] + (float)idx_y * matrix[2];
    double an = a, bn = b;
    double aan = an * an, bbn = bn * bn;
    for (int i = 0; i < 200; i += 5) {
        // iteration 1
        bn = 2 * an * bn + b;
        an = aan - bbn + a;
        aan = an * an;
        bbn = bn * bn;
        if (an * an + bn * bn > 4.0f) {
            result[idx_x * gridDim.y * blockDim.y + idx_y] = i;
            break;
        }
        // iteration 2
        bn = 2 * an * bn + b;
        an = aan - bbn + a;
        aan = an * an;
        bbn = bn * bn;
        if (an * an + bn * bn > 4.0f) {
            result[idx_x * gridDim.y * blockDim.y + idx_y] = i + 1;
            break;
        }
        // iteration 3
        bn = 2 * an * bn + b;
        an = aan - bbn + a;
        aan = an * an;
        bbn = bn * bn;
        if (an * an + bn * bn > 4.0f) {
            result[idx_x * gridDim.y * blockDim.y + idx_y] = i + 2;
            break;
        }
        // iteration 4
        bn = 2 * an * bn + b;
        an = aan - bbn + a;
        aan = an * an;
        bbn = bn * bn;
        if (an * an + bn * bn > 4.0f) {
            result[idx_x * gridDim.y * blockDim.y + idx_y] = i + 3;
            break;
        }
        // iteration 5
        bn = 2 * an * bn + b;
        an = aan - bbn + a;
        aan = an * an;
        bbn = bn * bn;
        if (an * an + bn * bn > 4.0f) {
            result[idx_x * gridDim.y * blockDim.y + idx_y] = i + 4;
            break;
        }
    }
}
""")
def calcMandelbrot(start, size, unit):
    s = time()
    # round size up to the next multiple of 256 (>= the requested size) so it divides evenly into 16x16 thread blocks
    size = ((size + 255) / 256).astype(np.uint32) * 256
    result = np.empty((size[0], size[1]), np.int32)
    func = cu.get_function("calcMandelbrot")
    func(cuda.In(np.array([start[0] - size[0] / 2 * unit,
                           start[1] - size[1] / 2 * unit, unit], np.float64)),
         cuda.Out(result),
         block=(16, 16, 1), grid=(int(size[0] / 16), int(size[1] / 16)))
    result = np.transpose(result)
    print("%f s" % (time() - s))
calcMandelbrot(np.array([0.39514148, -0.18187805]), np.array([800, 800]), 0.00015650431982753305)
This version takes about 0.02 s, which is much faster. Why does this happen?