Hi guys,
I’m quite new to CUDA and GPU computing. I’m going through the book CUDA by Example and trying to run the Julia set example in chapter four. I’ve made some changes to the code in the book because the struct constructor was not marked with `__device__`. I compile with the -lglut flag and it compiles just fine. However, some strange things happen when I try to run the program. On my first try I got
out of memory in line 62: HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, bitmap.image_size()));
I then tried lowering the DIM definition to 50 and still got the same error. I waited a little while, set DIM to 20, and suddenly it worked. I was then able to increase DIM all the way up to the original 1000. To experiment a bit I continued to increase DIM and gave it the value 1500; I then got the same memory error. However, when I set it back down to 1000 (which had worked just a minute earlier) I continued to get the memory error, and I get it even if I decrease DIM to 2!
Can someone help me understand this behavior? Isn’t the GPU’s memory freed, or is something entirely different causing this error?
#include "../../cuda-by-example-src/common/book.h"
#include "../../cuda-by-example-src/common/cpu_bitmap.h"
// Edge length of the square image: used as both dimensions of the CPUBitmap
// and of the launch grid in main(), and to scale coordinates in julia().
#define DIM 1000
// Minimal single-precision complex number used by the Julia-set iteration.
//
// Every member is __host__ __device__ so the type can be used from kernels
// and also exercised on the host. The arithmetic members are const-qualified
// because they never modify the object they are called on, which lets them
// be used on const objects and through const references.
struct cuComplex {
    float r;  // real part
    float i;  // imaginary part

    // Builds the complex number a + b*i.
    __host__ __device__ cuComplex(float a, float b) : r(a), i(b) {}

    // Returns the squared magnitude r*r + i*i (no square root is taken).
    __host__ __device__ float magnitude2(void) const {
        return r * r + i * i;
    }

    // Complex product: (r + i*j) * (a.r + a.i*j).
    __host__ __device__ cuComplex operator*(const cuComplex& a) const {
        return cuComplex(r * a.r - i * a.i, i * a.r + r * a.i);
    }

    // Component-wise complex sum.
    __host__ __device__ cuComplex operator+(const cuComplex& a) const {
        return cuComplex(r + a.r, i + a.i);
    }
};
// Decides whether the point at grid position (x, y) stays bounded under the
// iteration z -> z*z + c.
//
// The position is mapped to the complex plane by centring it on DIM/2 and
// scaling by 1.5. Returns 1 if the squared magnitude never exceeds 1000
// during 200 iterations, and 0 as soon as it does.
__device__ int julia(int x, int y)
{
    const float zoom = 1.5f;
    const int half = DIM / 2;

    // Same evaluation order as (zoom * float(half - coord)) / half.
    const float re = zoom * static_cast<float>(half - x) / half;
    const float im = zoom * static_cast<float>(half - y) / half;

    cuComplex c(-0.8f, 0.156f);
    cuComplex z(re, im);

    int step = 0;
    while (step < 200) {
        z = z * z + c;
        const bool escaped = z.magnitude2() > 1000;
        if (escaped) {
            return 0;
        }
        ++step;
    }
    return 1;
}
// Writes one 4-byte pixel per thread block into ptr.
//
// Launch layout: a 2D grid with a single thread per block; blockIdx.x and
// blockIdx.y are used directly as the pixel position, and gridDim.x is the
// row stride. ptr must hold at least gridDim.x * gridDim.y * 4 bytes.
//
// The byte offset is computed in size_t: for large grids the product
// (x + y * gridDim.x) * 4 no longer fits in a 32-bit int and would wrap.
__global__ void kernel(unsigned char *ptr) {
    // map from blockIdx to pixel position
    const int x = blockIdx.x;
    const int y = blockIdx.y;
    const size_t offset = static_cast<size_t>(x)
                        + static_cast<size_t>(y) * gridDim.x;

    // julia() yields 0 or 1, so the first byte is either 0 or 255.
    const int juliaValue = julia(x, y);

    // Four bytes per pixel (presumably RGBA - confirm against CPUBitmap).
    unsigned char *pixel = ptr + offset * 4;
    pixel[0] = 255 * juliaValue;
    pixel[1] = 0;
    pixel[2] = 0;
    pixel[3] = 255;
}
// Renders the DIM x DIM Julia image on the GPU, copies it back into a
// CPUBitmap and displays it.
int main(void)
{
    CPUBitmap bitmap(DIM, DIM);
    unsigned char *dev_bitmap;
    printf("image_size = %ld\n", bitmap.image_size());

    HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap,
                            bitmap.image_size()));

    // One block per pixel, one thread per block.
    dim3 grid(DIM, DIM);
    kernel<<<grid,1>>>(dev_bitmap);
    // A kernel launch returns no status itself; an invalid launch
    // configuration (e.g. a grid dimension that is too large) is only
    // reported through cudaGetLastError().
    HANDLE_ERROR(cudaGetLastError());

    // Blocking copy, so the kernel has finished once this returns.
    HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(),
                            dev_bitmap,
                            bitmap.image_size(),
                            cudaMemcpyDeviceToHost));

    // Release the device buffer BEFORE displaying: display_and_exit()
    // presumably does not return (per its name - confirm in cpu_bitmap.h),
    // so a cudaFree placed after it would never run.
    HANDLE_ERROR(cudaFree(dev_bitmap));

    bitmap.display_and_exit();
}