Unified Memory Behavior...

Hey guys,

I have this code :

int *nt = 0;
    cudaMallocManaged(&nt, sizeof(*nt));
    *nt = 1;

    {
        int *insert = 0, *pfx_sum = 0;
        cudaMalloc(&insert, *nt*sizeof(*insert));
        cudaMalloc(&pfx_sum, *nt*sizeof(*pfx_sum));

        thrust::inclusive_scan(
            thrust::device_pointer_cast(insert), 
            thrust::device_pointer_cast(insert) + *nt, 
            thrust::device_pointer_cast(pfx_sum)
        );
    }

And this just flat out fails but ONLY because of this line

thrust::device_pointer_cast(insert) + *nt,

and my question is, why?

I got it to work by adding a crude fix :

int tmp = *nt;

/* ... */
thrust::device_pointer_cast(insert) + tmp,

But why can’t I dereference unified memory in an argument list? Is it because it just freaks out the Thrust libraries?

I think you’ll get better responses if you include a complete, compilable code, and also be explicit about the error you are receiving rather than “flat out fails”. Is it a compile error? A runtime error? What is the actual error output?

For me, your code compiles and runs without error, as follows:

$ cat t480.cu
#include <thrust/device_ptr.h>
#include <thrust/scan.h>

int main(){

  int *nt = 0;
  cudaMallocManaged(&nt, sizeof(*nt));
  *nt = 1;
  {
     int *insert = 0, *pfx_sum = 0;
     cudaMalloc(&insert, *nt*sizeof(*insert));
     cudaMalloc(&pfx_sum, *nt*sizeof(*pfx_sum));
     thrust::inclusive_scan(
       thrust::device_pointer_cast(insert),
       thrust::device_pointer_cast(insert) + *nt,
       thrust::device_pointer_cast(pfx_sum)
       );
  }
}
$ nvcc -arch=sm_35 -o t480 t480.cu
$ cuda-memcheck ./t480
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

It would be nice if you provided a complete code, like above, that demonstrates the failure.

I suspect that the problem you are having may be due to the requirement to issue a cudaDeviceSynchronize(), after a kernel call, whenever trying to use any data created with cudaMallocManaged().

Thrust won’t issue this cudaDeviceSynchronize() for you.

For example, suppose I make this seemingly innocuous change to your code:

#include <thrust/device_ptr.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <iostream>

int main(){

  int *nt = 0;
  cudaMallocManaged(&nt, sizeof(*nt));
  *nt = 1;
  {
     int *insert = 0, *pfx_sum = 0;
     cudaMalloc(&insert, *nt*sizeof(*insert));
     cudaMemset(insert, 1, *nt*sizeof(int));
     cudaMalloc(&pfx_sum, *nt*sizeof(*pfx_sum));
     cudaMemset(pfx_sum, 0, *nt*sizeof(int));
     thrust::inclusive_scan(
       thrust::device_pointer_cast(insert),
       thrust::device_pointer_cast(insert) + *nt,
       thrust::device_pointer_cast(pfx_sum)
       );
     thrust::copy(thrust::device_pointer_cast(insert), thrust::device_pointer_cast(insert) + *nt, std::ostream_iterator<int>(std::cout, ","));
     std::cout << std::endl;
  }
}

All I’ve done is add the necessary bits and pieces so that I can actually observe the result of the scan operation. With the above code which compiles cleanly, I will get a runtime failure (seg fault), and if I run with cuda-memcheck, I get this instructive message:

" The application may have hit an error when dereferencing Unified Memory from the host. Please rerun the application under cuda-gdb or Nsight Eclipse Edition to catch host side errors."

If I add a cudaDeviceSynchronize() after the thrust::inclusive_scan call, but before the thrust::copy operation, then all is well.

I suspect something similar is going on in your case, and that you excerpted your example from another piece of code that was failing. Unfortunately your excerpted code does not fail by itself.

If you study the requirements for accessing managed data:

http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-coherency-hd

and carefully consider how this may apply to thrust, which is launching cuda kernels “under the hood”, you’ll probably find the root cause in your code. Thrust is not inserting the cudaDeviceSynchronize() calls that are necessary to make managed data accessible in host code. You have to do this yourself, at appropriate points.

Initially, nt is available to host code. Although you don’t show it, if you launch a kernel or a thrust call after the managed allocation of nt, then nt is no longer usable in host code. At that point, if you issue your thrust::inclusive_scan call, with one of the parameters dependent on nt, you’ll get a seg fault.

On the other hand, your “fix” grabs a copy of nt into a host accessible (non-managed) variable, before any CUDA kernel activity. At that point, I can launch CUDA kernels or thrust calls, making nt inaccessible, but my thrust::inclusive_scan call still works, because it does not depend on accessing nt.

Lol the code’s a bit long but if you wanted a complete example (I love posting code 'cause I feel so fancy) I can post it. This is good because I also wanted to see if maybe people could help me optimize my compute_distance kernel. I’m really new to CUDA and I recently got a 750 Ti so I’m not sure how to use it to its full potential yet.

Basically, I’m doing a lot of math and the main function is regulus() down at the bottom. I think you’re right though, bob, in the sense that it’s probably a synchronization error.

#include "structures.h"
#include "predicates.h"

__host__ __device__
inline unsigned FloatFlip(unsigned f)
{
	unsigned mask = -int(f >> 31) | 0xffffffff; //0x80000000;
	return f ^ mask;
}

__host__ __device__
inline unsigned IFloatFlip(unsigned f)
{
	unsigned mask = ((f >> 31) - 1) | 0xffffffff; //0x80000000;
	return f ^ mask;
}
__device__
inline float two_by_two_determinant(float *m )
{
    return m[0] * m[3] - m[1] * m[2];
}


__device__
inline float three_by_three_determinant(float *m)
{
    float u[4] = { m[4], m[5], m[7], m[8] };
    float v[4] = { m[3], m[5], m[6], m[8] };
    float w[4] = { m[3], m[4], m[6], m[7] };

    return m[0] * two_by_two_determinant(u) -
           m[1] * two_by_two_determinant(v) +
           m[2] * two_by_two_determinant(w);
}

__device__
void three_by_three_inverse(float *m)
{
    float a0[4] = { m[4], m[5], m[7], m[8] };
    float a1[4] = { m[2], m[1], m[8], m[7] };
    float a2[4] = { m[1], m[2], m[4], m[5] };
    float a3[4] = { m[5], m[3], m[8], m[6] };
    float a4[4] = { m[0], m[2], m[6], m[8] };
    float a5[4] = { m[2], m[0], m[5], m[3] };
    float a6[4] = { m[3], m[4], m[6], m[7] };
    float a7[4] = { m[1], m[0], m[7], m[6] };
    float a8[4] = { m[0], m[1], m[3], m[4] };

    float divisor = 1 / three_by_three_determinant(m);

    float n[9] = { two_by_two_determinant(a0),
                   two_by_two_determinant(a1),
                   two_by_two_determinant(a2),
                   two_by_two_determinant(a3),
                   two_by_two_determinant(a4),
                   two_by_two_determinant(a5),
                   two_by_two_determinant(a6),
                   two_by_two_determinant(a7),
                   two_by_two_determinant(a8)
                 };

    #pragma unroll
    for (int i = 0; i < 9; ++i) 
    {
        m[i] = n[i] * divisor;
    }
}
/*
__host__ __device__
bool operator()( const unsigned x )
{
    return (bool ) x;
}
*/

__device__
void print_tetra(tetra *t)
{
    printf("t : %d, %d, %d, %d\n", t->p[0], t->p[1], t->p[2], t->p[3]);
}

__global__
void fracture(point *points, tetra *mesh, int *pa, int *insert, int *pfx_sum, int *nt)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= *nt)
        return;

    if (!insert[i])
        return;

    int offset = (pfx_sum[i] - 1) * 3;

    tetra *t = mesh + i;
    int p = pa[t->insert_index];
    pa[t->insert_index] = -1;
    int *vtx = t->p;

    new(t + offset + 1) tetra(vtx[0], vtx[2], vtx[3], p);
    new(t + offset + 2) tetra(vtx[0], vtx[3], vtx[1], p);
    new(t + offset + 3) tetra(vtx[0], vtx[1], vtx[2], p);
    new(t) tetra(vtx[3], vtx[2], vtx[1], p);

    atomicAdd(nt, 3);
printf("offset[%d] : %d\n", i, pfx_sum[i]);
    print_tetra(t);
    print_tetra(t + 1);
    print_tetra(t + 2);
    print_tetra(t + 3);
}

__global__
void compare_distance(point *points, tetra *mesh, int *pa, int *ta, float *dist, int *insert, int n)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= n)
        return;

    if (pa[i] < 0)
        return;

    union { uint uint_dist; float flt_dist; };

    tetra *t = mesh + ta[i];

    uint_dist = IFloatFlip(t->min_dist);

    if (flt_dist == dist[i])
    {
        atomicMin(&(t->insert_index), i);
        atomicMax(insert + ta[i], 1);

        //printf("%d : %.00f\n", t->insert_index, dist[i]);
    }
}

__global__
void compute_distance(point *points, tetra *mesh, int *pa, int *ta, float *dist, int n, PredicateInfo *preds)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    
    if (i >= n)
        return;

    if (pa[i] < 0 )
        return;

    tetra *t = mesh + ta[i];
    //print_tetra(t);

    float *a = (points + t->p[0])->p;
    float *b = (points + t->p[1])->p;
    float *c = (points + t->p[2])->p;
    float *d = (points + t->p[3])->p;

    float *e = (points + pa[i])->p;

    float A[9] = { 0 };

    float u[3] = { b[0] - a[0], b[1] - a[1], b[2] - a[2] };
    float v[3] = { c[0] - a[0], c[1] - a[1], c[2] - a[2] };
    float w[3] = { d[0] - a[0], d[1] - a[1], d[2] - a[2] };

    for (int j = 0; j < 3; ++j)
    {
        int offset = 3 * j;

        A[offset + 0] = u[j];
        A[offset + 1] = v[j];
        A[offset + 2] = w[j];
    }

    three_by_three_inverse(A);

    float B[3] = { e[0] - a[0], e[1] - a[1], e[2] - a[2] };
    float x[3] = { A[0] * B[0] + A[1] * B[1] + A[2] * B[2], 
                   A[3] * B[0] + A[4] * B[1] + A[5] * B[2],
                   A[6] * B[0] + A[7] * B[1] + A[8] * B[2]
                 };

    if (x[0] > 0 && x[1] > 0 && x[2] > 0 && x[0] + x[1] + x[2] <  1)
    {
        dist[i] = insphere(*preds, a, b, c, d, e);
        atomicMin(&(t->min_dist), FloatFlip((unsigned& ) dist[i]));
/*
        union
        {
            uint uint_dist;
            float flt_dist;
        };

        uint_dist = IFloatFlip(t->min_dist);
        printf("%.00f\n", flt_dist);*/
    }
    else
        dist[i] = FLT_MAX;

    //printf("%d : %.00f\n", i, dist[i]);
}

__global__
void print_pa(int *pa, int n)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i >= n)
        return;

    printf("%d\n", pa[i]);
}

void regulus(point *points, tetra *mesh, int np)
{
    int tpb = 512;
    int blk = np / tpb + (np % tpb ? 1 : 0);

    PredicateInfo host_preds;
    initPredicate(host_preds);

    PredicateInfo *preds;
    cudaMalloc(&preds, sizeof(*preds));
    cudaMemcpy(preds, &host_preds, sizeof(*preds), cudaMemcpyHostToDevice );

    int *pa = 0, *ta = 0;
    cudaMalloc(&pa, np*sizeof(*pa));
    cudaMalloc(&ta, np*sizeof(*ta));

    int *nt = 0;
    cudaMallocManaged(&nt, sizeof(*nt));
    *nt = 1;

    for (int i = 0; i < np; ++i)
    {
        int tmp = 4 + i;
        cudaMemcpy(pa + i, &tmp, sizeof(tmp), cudaMemcpyHostToDevice);
    }

    cudaMemset(ta, 0, np*sizeof(*ta));

    for (int i = 0; i < 2; ++i)
    {
        printf("Iterative index : %d\n\n", i);

        float *dist = 0;
        cudaMalloc(&dist, np*sizeof(*dist));

        int *insert = 0, *pfx_sum = 0;
        cudaMalloc(&insert, *nt*sizeof(*insert));
        cudaMalloc(&pfx_sum, *nt*sizeof(*pfx_sum));
        cudaMemset(insert, 0, *nt*sizeof(*insert));
        cudaMemset(pfx_sum, 0, *nt*sizeof(*insert));

        int tmp = *nt;

        compute_distance<<< blk, tpb >>>(points, mesh, pa, ta, dist, np, preds);
        compare_distance<<< blk, tpb >>>(points, mesh, pa, ta, dist, insert, np);
        thrust::inclusive_scan(
            thrust::device_pointer_cast(insert), 
            thrust::device_pointer_cast(insert) + tmp, 
            thrust::device_pointer_cast(pfx_sum)
        );
        fracture<<< tmp / tpb + (tmp % tpb ? 1 : 0), tpb >>>(points, mesh, pa, insert, pfx_sum, nt);
        cudaDeviceSynchronize();

        cudaFree(pfx_sum);
        cudaFree(insert);
        cudaFree(dist);
    }

    cudaFree(nt);
    cudaFree(ta);
    cudaFree(pa);
    cudaFree(preds);   
}

The ideal case is to post the shortest possible code that reproduces the error.

No, I’m not asking for your complete code. I’m asking for a complete code, that demonstrates the failure. If you reduce the code down to the minimal amount that still reproduces the failure, it’s quite likely you’ll discover the problem yourself, learn a useful debugging technique, and won’t need to wait for an answer from someone else about it.

If the request is not clear, take a look at:

[url]http://sscce.org/[/url]

The code you’ve posted has your fix already applied.

My guess is that reverting your code to the failing case, and then putting a cudaDeviceSynchronize() call before the thrust::inclusive_scan call, would clear up the issue.

Thank you for your help, txbob.

In my defense about posting long code, I would have to open gedit and highlight things and I was too lazy to do that because yesterday was not a “let’s open gedit and code” kind of day.

But I was able to fix my code and you were right, I needed more synchronization calls. In fact, my code only works when I sandwich the thrust::inclusive_scan() operation between cudaDeviceSynchronize() calls.

This is the now working (and shorter example) :

void regulus(point *points, tetra *mesh, int np)
{
    int tpb = 512;
    int blk = np / tpb + (np % tpb ? 1 : 0);

    PredicateInfo host_preds;
    initPredicate(host_preds);

    PredicateInfo *preds;
    cudaMalloc(&preds, sizeof(*preds));
    cudaMemcpy(preds, &host_preds, sizeof(*preds), cudaMemcpyHostToDevice );

    int *pa = 0, *ta = 0;
    cudaMalloc(&pa, np*sizeof(*pa));
    cudaMalloc(&ta, np*sizeof(*ta));

    int *nt = 0;
    cudaMallocManaged(&nt, sizeof(*nt));
    *nt = 1;

    for (int i = 0; i < np; ++i)
    {
        int tmp = 4 + i;
        cudaMemcpy(pa + i, &tmp, sizeof(tmp), cudaMemcpyHostToDevice);
    }

    cudaMemset(ta, 0, np*sizeof(*ta));

    for (int i = 0; i < 2; ++i)
    {
        printf("Iterative index : %d\n\n", i);

        float *dist = 0;
        cudaMalloc(&dist, np*sizeof(*dist));

        int *insert = 0, *pfx_sum = 0;
        cudaMalloc(&insert, *nt*sizeof(*insert));
        cudaMalloc(&pfx_sum, *nt*sizeof(*pfx_sum));
        cudaMemset(insert, 0, *nt*sizeof(*insert));
        cudaMemset(pfx_sum, 0, *nt*sizeof(*insert));

        compute_distance<<< blk, tpb >>>(points, mesh, pa, ta, dist, np, preds); 
        compare_distance<<< blk, tpb >>>(points, mesh, pa, ta, dist, insert, np);  

        cudaDeviceSynchronize();

        thrust::inclusive_scan(
            thrust::device_pointer_cast(insert), 
            thrust::device_pointer_cast(insert) + *nt, 
            thrust::device_pointer_cast(pfx_sum)
        ); 

        cudaDeviceSynchronize();

        fracture<<< *nt / tpb + (*nt % tpb ? 1 : 0), tpb >>>(points, mesh, pa, insert, pfx_sum, nt);

        cudaDeviceSynchronize();

        cudaFree(pfx_sum);
        cudaFree(insert);
        cudaFree(dist);
    }

    cudaFree(nt);
    cudaFree(ta);
    cudaFree(pa);
    cudaFree(preds);   
}