Program hit error 2 on CUDA API call to cudaLaunch (stack overflow?)

Hey All,

I’m hitting a really lame error. So, I adapted Shewchuk’s exact geoemetric predicates using Fast Robust Predicates for Computational Geometry

I adapted his C code to CUDA by lazily putting device in front of each function and luckily enough, I’m able to use his orient3d() function on the GPU (calling it from a global kernel).

However, when I try to use his insphere() function, I keep getting the error message mentioned in the title. I don’t have enough memory?

I’m assuming this is similar to a stack overflow in the sense that insphere() creates too many stack arguments and the memory given to each thread isn’t sufficient. So, how do I fix this?

I can post any and all code that may be required but it is a bit big.

Why do you reference “stack overflow”? What is the actual error text?

error 2 looks like this in driver_types.h:

/**
     * The API call failed because it was unable to allocate enough memory to
     * perform the requested operation.
     */
    cudaErrorMemoryAllocation             =      2,

I suppose that might come during a cudaLaunch, if for example you had a “large” device variable (i.e. static allocation) which may get allocated on the GPU at the first kernel launch of your program.

Are you certain that the error is occurring during a kernel launch, or is it possibly “left over” from a previous API call (e.g. cudaMalloc, which would return an error 2 if the requested allocation size is too large) that you didn’t adequately check for errors on ?

__global__
void compute_distance(int *pt_assoc, int *tet_assoc, point *p, tetra *mesh, float *dist, float *closest, int n) {

    int i = threadIdx.x + blockIdx.x*blockDim.x;
        
    if (i >= n) // out of bounds
        return;

    tetra *t = mesh + tet_assoc[i];

    point *a = p + t->p[0];
    point *b = p + t->p[1];
    point *c = p + t->p[2];
    point *d = p + t->p[3];

    point *e = p + pt_assoc[i];

    float pa[3], pb[3], pc[3], pd[3], pe[3];

    pa[0] = a->x;
    pa[1] = a->y;
    pa[2] = a->z;

    pb[0] = b->x;
    pb[1] = b->y;
    pb[2] = b->z;

    pc[0] = c->x;
    pc[1] = c->y;
    pc[2] = c->z;

    pd[0] = d->x;
    pd[1] = d->y;
    pd[2] = d->z;

    pe[0] = e->x;
    pe[1] = e->y;
    pe[2] = e->z;
//printf("%f, %f, %f\n", pe[0], pe[1], pe[2]);
    //float test = insphere(pa, pb, pc, pd, pe);
    printf("%f\n", orient3d(&(a->x), &(b->x), &(c->x), &(d->x)));
    printf("%f\n", insphere(pa, pb, pc, pd, pe));    

    return;
}

main.cu :

#include "structures.h"

int main(void) {

    // main point set

    thrust::device_vector<point> point_set;
    build_set(point_set);

    // mesh

    thrust::device_vector<tetra> mesh;
    mesh.reserve(8 * num_points);
    mesh.push_back(tetra(0, 1, 2, 3));

    // arrays of uninserted points and their associated
    // tetrahedra

    thrust::device_vector<int> pt_assoc, tet_assoc(num_points, 0);
    pt_assoc.reserve(num_points);

    for (int i = 1; i < num_points; ++i) {

        pt_assoc.push_back(i + 3);
    }

    // distance each point is from its associated tetrahedra

    thrust::device_vector<float> tet_dist;
    tet_dist.reserve(pt_assoc.size());    

    // the minimum distance each point is from its enclosing
    // tetrahedra's circumcenter

    thrust::device_vector<float> min_tet_dist;
    min_tet_dist.reserve(mesh.capacity());

    int tpb = 32;
    int blocks = num_points/tpb + (num_points % tpb ? 1 : 0);

    init<<< 1, 1 >>>();

    compute_distance<<< 1, 1 >>>
                    (thrust::raw_pointer_cast(pt_assoc.data()), 
                     thrust::raw_pointer_cast(tet_assoc.data()), 
                     thrust::raw_pointer_cast(point_set.data()), 
                     thrust::raw_pointer_cast(mesh.data()), 
                     thrust::raw_pointer_cast(tet_dist.data()), 
                     thrust::raw_pointer_cast(min_tet_dist.data()), 
                     pt_assoc.size());

    cudaDeviceSynchronize();

    return 0;
}

structures.h :

#include <iostream>
#include <cassert>
#include <cstring>

#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/scan.h>

const int verbose = 3;

const int bl = 4; // length of box
const int rl = (bl - 1) * 3; // length of fiducial tetrahedron
const int num_points = bl*bl*bl; // total number of points

struct __align__(16) point {

    float x, y, z;

    point(void) : x(0), y(0), z(0) { };

    point(float a, float b, float c) : x(a), y(b), z(c) { };
};

struct tetra {

    int p[4];
    int ngb[4];

    tetra(int a, int b, int c, int d) {

        p[0] = a;   ngb[0] = -1;
        p[1] = b;   ngb[1] = -1;
        p[2] = c;   ngb[2] = -1;
        p[3] = d;   ngb[3] = -1;     
    };
};

I’m not sure how much this code makes sense. It’s because the routines I copied write points as a 3 element array while I write mine as a structure with x,y and z members (all floats).

Here’s the thing, orient3d() is an adapted routine and it works fine. So I’m just curious why insphere() isn’t. I think it might have something to do with all the static variables declared in the routine.

If I understand the problem correctly, and if it is indeed a memory allocation problem,

… if it were me facing the problem, given that you are allocating a number of arrays/ structures, I would test the array/ structure creation one by one, to see which ones cause the issue, by first using dummy sizes for the other arrays (size of 1), or simply removing them at first

Placing a breakpoint on the kernel’s 1st line should prevent the changed sizes having detrimental effect

Okay, I found the bug!

So, here’s the insphere() routine. Well, I’ll post it below but first, I was only so hesitant to post it because it relies on A LOT of things. A ton of macros and some external functions. But here’s just the function that I’m using by itself (btw, REAL is #define REAL float) :

__device__ REAL insphere(REAL *pa, REAL *pb, REAL *pc, REAL *pd, REAL *pe) { 

  REAL aex, bex, cex, dex;
  REAL aey, bey, cey, dey;
  REAL aez, bez, cez, dez;
  REAL aexbey, bexaey, bexcey, cexbey, cexdey, dexcey, dexaey, aexdey;
  REAL aexcey, cexaey, bexdey, dexbey;
  REAL alift, blift, clift, dlift;
  REAL ab, bc, cd, da, ac, bd;
  REAL abc, bcd, cda, dab;
  REAL aezplus, bezplus, cezplus, dezplus;
  REAL aexbeyplus, bexaeyplus, bexceyplus, cexbeyplus;
  REAL cexdeyplus, dexceyplus, dexaeyplus, aexdeyplus;
  REAL aexceyplus, cexaeyplus, bexdeyplus, dexbeyplus;
  REAL det;
  REAL permanent, errbound;

  aex = pa[0] - pe[0];
  bex = pb[0] - pe[0];
  cex = pc[0] - pe[0];
  dex = pd[0] - pe[0];
  aey = pa[1] - pe[1];
  bey = pb[1] - pe[1];
  cey = pc[1] - pe[1];
  dey = pd[1] - pe[1];
  aez = pa[2] - pe[2];
  bez = pb[2] - pe[2];
  cez = pc[2] - pe[2];
  dez = pd[2] - pe[2];

  aexbey = aex * bey;
  bexaey = bex * aey;
  ab = aexbey - bexaey;
  bexcey = bex * cey;
  cexbey = cex * bey;
  bc = bexcey - cexbey;
  cexdey = cex * dey;
  dexcey = dex * cey;
  cd = cexdey - dexcey;
  dexaey = dex * aey;
  aexdey = aex * dey;
  da = dexaey - aexdey;

  aexcey = aex * cey;
  cexaey = cex * aey;
  ac = aexcey - cexaey;
  bexdey = bex * dey;
  dexbey = dex * bey;
  bd = bexdey - dexbey;

  abc = aez * bc - bez * ac + cez * ab;
  bcd = bez * cd - cez * bd + dez * bc;
  cda = cez * da + dez * ac + aez * cd;
  dab = dez * ab + aez * bd + bez * da;

  alift = aex * aex + aey * aey + aez * aez;
  blift = bex * bex + bey * bey + bez * bez;
  clift = cex * cex + cey * cey + cez * cez;
  dlift = dex * dex + dey * dey + dez * dez;

  det = (dlift * abc - clift * dab) + (blift * cda - alift * bcd);

  aezplus = Absolute(aez);
  bezplus = Absolute(bez);
  cezplus = Absolute(cez);
  dezplus = Absolute(dez);
  aexbeyplus = Absolute(aexbey);
  bexaeyplus = Absolute(bexaey);
  bexceyplus = Absolute(bexcey);
  cexbeyplus = Absolute(cexbey);
  cexdeyplus = Absolute(cexdey);
  dexceyplus = Absolute(dexcey);
  dexaeyplus = Absolute(dexaey);
  aexdeyplus = Absolute(aexdey);
  aexceyplus = Absolute(aexcey);
  cexaeyplus = Absolute(cexaey);
  bexdeyplus = Absolute(bexdey);
  dexbeyplus = Absolute(dexbey);
  permanent = ((cexdeyplus + dexceyplus) * bezplus
               + (dexbeyplus + bexdeyplus) * cezplus
               + (bexceyplus + cexbeyplus) * dezplus)
            * alift
            + ((dexaeyplus + aexdeyplus) * cezplus
               + (aexceyplus + cexaeyplus) * dezplus
               + (cexdeyplus + dexceyplus) * aezplus)
            * blift
            + ((aexbeyplus + bexaeyplus) * dezplus
               + (bexdeyplus + dexbeyplus) * aezplus
               + (dexaeyplus + aexdeyplus) * bezplus)
            * clift
            + ((bexceyplus + cexbeyplus) * aezplus
               + (cexaeyplus + aexceyplus) * bezplus
               + (aexbeyplus + bexaeyplus) * cezplus)
            * dlift;
  errbound = isperrboundA * permanent;
  if ((det > errbound) || (-det > errbound)) {
    return det;
  } 

  return 0;
  //return insphereadapt(pa, pb, pc, pd, pe, permanent);
}

And here’s the code that matters at the very end of it :

if ((det > errbound) || (-det > errbound)) {
    return det;
  } 

  return 0;
  //return insphereadapt(pa, pb, pc, pd, pe, permanent);

Returning insphereadapt() is what crashes my code. Using return 0 actually makes it run! I think what’s happening is, I have too many function calls so the call stack is overflowed or is just simply not large enough. Because it’s all statically allocated, the size is known at compilation time so it would automatically give a failure.

I think this just means that if my original insphere() result isn’t within error bounds, I need to call all the other functions separately which is a small modification, to say the least. It’s easier than writing my own robust routines lol.