Small update: I managed to improve the performance of the code by roughly 20x with a few simple changes.

First, I merged the duplicated calls to rtPotentialIntersection and rtReportIntersection inside the while-loop into a single occurrence. This lowered the render time from 100 ms to roughly 50 ms.

In the second step I moved both calls out of the loop entirely (they are now issued once, after the loop) and got another improvement from 50 ms down to ~5 ms, which is on the acceptable side but still ~10x slower than a similar rendering in pure CUDA (0.1 - 0.5 ms at the same image resolution and a similar data size!). All timings were measured on an RTX 2060 card.

So apparently OptiX does not like rtPotentialIntersection and/or rtReportIntersection calls inside loops that run many iterations - at least for the combination of CUDA 10.1 and OptiX 6.0.

Modified intersection routine:

```
// Heightfield intersection program.
//
// Clips the ray against the box [boxmin, boxmax], then steps cell by cell
// through the height grid in the x/z plane. Each visited cell whose height
// range overlaps the ray's y-range is split into two triangles, and both are
// tested. The walk stops at the first cell that yields a triangle hit, and
// that single hit is reported once, after the loop.
//
// boxmin, boxmax, cellsize, inv_cellsize, heights, ray, the attribute
// variables written at the end, and the helpers intersect_triangle,
// computeNormal and refine_and_offset_hitpoint are declared outside this
// snippet. primIdx is not used.
RT_PROGRAM void intersect(int primIdx)
{
    // Ray/box slab test: per-axis entry and exit parameters.
    const float3 slab_a  = (boxmin - ray.origin) / ray.direction;
    const float3 slab_b  = (boxmax - ray.origin) / ray.direction;
    const float3 slab_lo = fminf(slab_a, slab_b);
    const float3 slab_hi = fmaxf(slab_a, slab_b);
    float t_cur = fmaxf(slab_lo);   // largest per-axis entry
    float t_end = fminf(slab_hi);   // smallest per-axis exit

    // Ray misses the box, or the box lies behind the ray.
    if (t_cur >= t_end || t_end < 1.e-6f)
        return;

    // Restrict the walk to the part of the ray in [0, ray.tmax].
    t_cur = max(t_cur, 0.f);
    t_end = min(t_end, ray.tmax);

    // Grid dimensions in nodes, and the cell containing the entry point.
    // The cell index is capped at (nodes - 2) so that index + 1 is still a
    // valid node when the four corner heights are fetched.
    uint2 nnodes;
    nnodes.x = heights.size().x;
    nnodes.y = heights.size().y;
    const float3 entry_cells = (ray.origin + t_cur * ray.direction - boxmin) * inv_cellsize;
    int cell_u = min(__float2int_rz(entry_cells.x), nnodes.x - 2);
    int cell_v = min(__float2int_rz(entry_cells.z), nnodes.y - 2);

    // Step direction per axis, and the cell index at which the walk has
    // left the grid on that axis.
    const float3 dir_cells = ray.direction * inv_cellsize;
    const bool forward_u = dir_cells.x > 0;
    const bool forward_v = dir_cells.z > 0;
    const int step_u = forward_u ? 1 : -1;
    const int step_v = forward_v ? 1 : -1;
    const int stop_u = forward_u ? (int)(nnodes.x) - 1 : -1;
    const int stop_v = forward_v ? (int)(nnodes.y) - 1 : -1;

    // Ray-parameter distance between consecutive cell boundaries per axis.
    const float dt_u = abs(cellsize.x / ray.direction.x);
    const float dt_v = abs(cellsize.z / ray.direction.z);

    // World-space position of the next boundary to be crossed on each axis,
    // and the ray parameter at which that crossing happens.
    const float plane_u = (forward_u ? cell_u + 1 : cell_u) * cellsize.x + boxmin.x;
    const float plane_v = (forward_v ? cell_v + 1 : cell_v) * cellsize.z + boxmin.z;
    float next_u = (plane_u - ray.origin.x) / ray.direction.x;
    float next_v = (plane_v - ray.origin.z) / ray.direction.z;

    // Height of the ray where it enters the current cell.
    float y_in = ray.origin.y + t_cur * ray.direction.y;

    // Result of the walk; only meaningful when found is true.
    bool   found = false;
    float  hit_t;
    float3 hit_n;
    float3 p00;   // corner (u, v) of the most recently tested cell

    while (t_cur < t_end) {
        const float t_out = min(next_u, next_v);
        const float y_out = ray.origin.y + t_out * ray.direction.y;

        // Corner heights of the current cell and their range.
        const float h00 = heights[make_uint2(cell_u, cell_v)];
        const float h01 = heights[make_uint2(cell_u, cell_v + 1)];
        const float h10 = heights[make_uint2(cell_u + 1, cell_v)];
        const float h11 = heights[make_uint2(cell_u + 1, cell_v + 1)];
        const float h_lo = min(min(h00, h01), min(h10, h11));
        const float h_hi = max(max(h00, h01), max(h10, h11));

        const float y_lo = min(y_in, y_out);
        const float y_hi = max(y_in, y_out);

        // Only build and test the triangles when the ray's y-range inside
        // this cell overlaps the cell's height range.
        if (y_lo <= h_hi && y_hi >= h_lo) {
            p00 = make_float3(boxmin.x + cell_u*cellsize.x, h00, boxmin.z + cell_v*cellsize.z);
            const float3 p11 = make_float3(p00.x + cellsize.x, h11, p00.z + cellsize.z);
            const float3 p01 = make_float3(p00.x, h01, p11.z);
            const float3 p10 = make_float3(p11.x, h10, p00.z);

            // Both triangles are always tested. beta/gamma are required as
            // out-arguments by intersect_triangle but are not used afterwards.
            float3 n_a, n_b;
            float  t_a, t_b;
            float  beta_a, gamma_a, beta_b, gamma_b;
            const bool hit_a = intersect_triangle(ray, p00, p11, p10, n_a, t_a, beta_a, gamma_a);
            const bool hit_b = intersect_triangle(ray, p00, p01, p11, n_b, t_b, beta_b, gamma_b);

            if (hit_a || hit_b) {
                // Keep the first triangle when it is the only hit or strictly
                // closer; otherwise (including ties) keep the second one.
                const bool take_a = hit_a && (!hit_b || t_a < t_b);
                hit_t = take_a ? t_a : t_b;
                hit_n = take_a ? n_a : n_b;
                found = true;
                break;   // leave cell_u/cell_v/p00 pointing at the hit cell
            }
        }

        // Advance to the neighbouring cell across whichever boundary the
        // ray reaches first; stop once the walk leaves the grid.
        y_in = y_out;
        if (next_u < next_v) {
            cell_u += step_u;
            if (cell_u == stop_u)
                break;
            t_cur = next_u;
            next_u += dt_u;
        }
        else {
            cell_v += step_v;
            if (cell_v == stop_v)
                break;
            t_cur = next_v;
            next_v += dt_v;
        }
    }

    if (!found)
        return;

    // Single report after the walk: attributes are written only between a
    // successful rtPotentialIntersection and the rtReportIntersection call.
    if (!rtPotentialIntersection(hit_t))
        return;

    geometric_normal = normalize(hit_n);
    shading_normal = computeNormal(cell_u, cell_v, ray.origin + hit_t*ray.direction);
    refine_and_offset_hitpoint(ray.origin + hit_t*ray.direction, ray.direction,
                               geometric_normal, p00,
                               back_hit_point, front_hit_point);
    rtReportIntersection(0);
}
```

Edit: I also compiled this on the notebook with the MX130 card (where the sample previously crashed), and it now works there too!