Fatal error: expression 'HX_CU_CALL_CHECK(p_cuStreamSynchronize(stream))' (value 1) is not equal to expression 'HX_SUCCESS' (value 0)

Hi everyone, I am trying to use multiple devices to benchmark triad operations, but I encounter the following error:
Fatal error: expression ‘HX_CU_CALL_CHECK(p_cuStreamSynchronize(stream))’ (value 1) is not equal to expression ‘HX_SUCCESS’ (value 0).

/// @brief Runs the triad kernel a[i] = b[i] + c[i] * d[i] with the data set
///        split into one contiguous chunk per OpenMP target device, then
///        prints one result line.
///
/// @param datasetSize  Number of elements in each of the four arrays.
/// @param repetitions  Number of times the triad sweep is repeated per device.
/// @param numTeams     Value passed to num_teams() on the target teams region.
/// @param numThreads   Value passed to num_threads() on the inner parallel region.
void triad(unsigned long datasetSize, unsigned long repetitions, long numTeams, long numThreads)
{
    const int num_dev = omp_get_num_devices();
    if (num_dev <= 0) {
        // Without this guard the chunk-size computation below divides by zero.
        fprintf(stderr, "triad: no OpenMP target devices available\n");
        return;
    }

    volatile auto *a = new double[datasetSize];
    auto *b = new double[datasetSize];
    auto *c = new double[datasetSize];
    auto *d = new double[datasetSize];

    // Chunk bounds are kept in unsigned long (same type as datasetSize) so
    // that large data sets are not truncated to int.
    const unsigned long chunkSize = datasetSize / static_cast<unsigned long>(num_dev);

    // Allocate each device's chunk and initialise it on the device.
    #pragma omp parallel for
    for (int dev = 0; dev < num_dev; dev++) {
        const unsigned long l = static_cast<unsigned long>(dev) * chunkSize;
        // The last device also takes the remainder, so every element that the
        // checksum later reads has been initialised and computed.
        const unsigned long n = (dev == num_dev - 1) ? datasetSize - l : chunkSize;
        const unsigned long h = l + n;
        if (n == 0) {
            continue;
        }

        #pragma omp target enter data map(alloc : a[l : n], b[l : n], c[l : n], d[l : n]) device(dev)

        // The array sections are repeated on the target construct on purpose.
        // A pointer used in a target region without a map clause is looked up
        // as a zero-length section at its base address (a[0]); that address
        // lies inside the mapped chunk only for the first device, so on the
        // other devices the pointer would not be translated to device memory.
        // The data is already present, so this map(alloc) copies nothing.
        #pragma omp target parallel for map(alloc : a[l : n], b[l : n], c[l : n], d[l : n]) device(dev)
        for (unsigned long i = l; i < h; ++i) {
            a[i] = b[i] = c[i] = d[i] = i;
        }
    }

    auto start = std::chrono::high_resolution_clock::now();

    // NOTE(review): a target construct without nowait is synchronous, so the
    // devices are processed one after another here — confirm this is the
    // intended measurement.
    for (int dev = 0; dev < num_dev; dev++) {
        const unsigned long l = static_cast<unsigned long>(dev) * chunkSize;
        const unsigned long n = (dev == num_dev - 1) ? datasetSize - l : chunkSize;
        const unsigned long h = l + n;
        if (n == 0) {
            continue;
        }

        // Explicit map for the same pointer-translation reason as above.
        #pragma omp target teams num_teams(numTeams) map(alloc : a[l : n], b[l : n], c[l : n], d[l : n]) device(dev)
        for (unsigned long j = 0; j < repetitions; ++j)
        {
            #pragma omp parallel num_threads(numThreads)
            {
                #pragma omp for schedule(static, 2)
                for (unsigned long i = l; i < h; ++i)
                {
                    a[i] = b[i] + c[i] * d[i];
                }
            }
        }

        // Copy the result chunk back and drop the device copies of the
        // inputs as well, so no device memory stays allocated.
        #pragma omp target exit data map(from : a[l : n]) map(release : b[l : n], c[l : n], d[l : n]) device(dev)
    }

    auto stop = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration<double>(stop - start).count();

    double checksum = calculateChecksum(datasetSize, a);

    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;

    double mflops = calculateMegaFlopRate(datasetSize, repetitions, duration);
    // NOTE(review): targetDeviceId is not declared in this function; presumably
    // a file-scope variable left over from a single-device version — verify.
    // datasetSize and repetitions are unsigned long, hence %lu.
    printf("%10lu %8ld %8ld %8.2f %8lu %4d %.4e\n", datasetSize, numTeams, numThreads, mflops, repetitions, targetDeviceId, checksum);
}

Hi somersetma17,

This is a generic error indicating that something errored in the kernel, though exactly what, I’m not sure. Can you provide a full reproducing example that I can use to investigate?

Thanks,
Mat