cudaOpenMP failed to pass correctResult

AskCuda · November 28, 2016, 9:36pm

I tried to increase the number of element n = 8129 to n = 2 * 3200 * 2 * 8192 * 8 and
then ran the code but unfortunately it failed to pass correctResult. If I increase n
to 2 * 3200 * 2 * 8192 * 4 then it passed the check. the GPU was GTX TITAN X with 12G
memory.

/*

Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
Please refer to the NVIDIA end user license agreement (EULA) associated
with this source code for terms and conditions that govern your use of
this software. Any use, reproduction, disclosure, or distribution of
this software and related documentation outside the terms of the EULA
is strictly prohibited.

*/

/*

Multi-GPU sample using OpenMP for threading on the CPU side
needs a compiler that supports OpenMP 2.0
*/

#include <omp.h>
#include <stdio.h> // stdio functions are used since C++ streams aren’t necessarily thread safe
#include <helper_cuda.h>

using namespace std;

// a simple kernel that simply increments each array element by b
global void kernelAddConstant(unsigned int *g_a, const unsigned int b)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
g_a[idx] += b;
}

// a predicate that checks whether each array element is set to its index plus b
int correctResult(unsigned int *data, const unsigned int n, const unsigned int b)
{
int success = 1;
for (unsigned int i = 0; i < n; i++)
{
if (data[i] != i + b)
{
printf(“%u %u %u\n”, i, data[i], i + b);
success = 0;
return success;
}
}

return success;

}

int main(int argc, char *argv)
{
int num_gpus = 0; // number of CUDA GPUs

printf("%s Starting...\n\n", argv[0]);

/////////////////////////////////////////////////////////////////
// determine the number of CUDA capable GPUs
//
cudaGetDeviceCount(&num_gpus);

if (num_gpus < 1)
{
    printf("no CUDA capable devices were detected\n");
    return 1;
}

/////////////////////////////////////////////////////////////////
// display CPU and GPU configuration
//
printf("number of host CPUs:\t%d\n", omp_get_num_procs());
printf("number of CUDA devices:\t%d\n", num_gpus);

for (int i = 0; i < num_gpus; i++)
{
    cudaDeviceProp dprop;
    cudaGetDeviceProperties(&dprop, i);
    printf("   %d: %s\n", i, dprop.name);
}

printf("---------------------------\n");

printf("sizeof(unsigned int) = %u\n", sizeof(unsigned int));

/////////////////////////////////////////////////////////////////
// initialize data
//
unsigned int n = 2 * 3200 * num_gpus * 8192 * 8;
unsigned int nbytes = n * sizeof(unsigned int);
unsigned int *a = 0;     // pointer to data on the CPU
unsigned int b = 3;      // value by which the array is incremented
a = (unsigned int *)malloc(nbytes);

if (0 == a)
{
    printf("couldn't allocate CPU memory\n");
    return 1;
}

for (unsigned int i = 0; i < n; i++)
    a[i] = i;

////////////////////////////////////////////////////////////////
// run as many CPU threads as there are CUDA devices
//   each CPU thread controls a different device, processing its
//   portion of the data.  It's possible to use more CPU threads
//   than there are CUDA devices, in which case several CPU
//   threads will be allocating resources and launching kernels
//   on the same device.  For example, try omp_set_num_threads(2*num_gpus);
//   Recall that all variables declared inside an "omp parallel" scope are
//   local to each CPU thread
//
//omp_set_num_threads(num_gpus);  // create as many CPU threads as there are CUDA devices
omp_set_num_threads(8*num_gpus);// create twice as many CPU threads as there are CUDA devices
#pragma omp parallel shared(n, nbytes, a, b)
{
    unsigned int cpu_thread_id = omp_get_thread_num();
    unsigned int num_cpu_threads = omp_get_num_threads();

// // set and check the CUDA device for this CPU thread
// int gpu_id = -1;
// checkCudaErrors(cudaSetDevice(cpu_thread_id % num_gpus)); // “% num_gpus” allows more CPU threads than GPU devices
// checkCudaErrors(cudaGetDevice(&gpu_id));
// printf(“CPU thread %d (of %d) uses CUDA device %d\n”, cpu_thread_id, num_cpu_threads, gpu_id);

    // set and check the CUDA device for this CPU thread
	int gpu_id = 0;
	checkCudaErrors(cudaSetDevice(gpu_id)); // "% num_gpus" allows more CPU threads than GPU devices
	checkCudaErrors(cudaGetDevice(&gpu_id));
	printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id,
			num_cpu_threads, gpu_id);


	unsigned int *d_a = 0;   // pointer to memory on the device associated with this CPU thread
    unsigned int *sub_a = a + cpu_thread_id * n / num_cpu_threads;   // pointer to this CPU thread's portion of data
    unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
    dim3 gpu_threads(128);  // 128 threads per block
    dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));

    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
    checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
    checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
    kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);

    checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaFree(d_a));

}
printf("---------------------------\n");

if (cudaSuccess != cudaGetLastError())
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));


////////////////////////////////////////////////////////////////
// check the result
//
bool bResult = correctResult(a, n, b);

if (a)
    free(a); // free CPU memory

// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice.  It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
cudaDeviceReset();

if (bResult) {
	printf("bResult = true\n");
} else {
	printf("bResult = false\n");
}

exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);

}

AskCuda · November 29, 2016, 4:56pm

Problem solved.

Topic		Replies	Views
i_have_problem_with_openmp_cuda Visual Profiler and nvprof	0	1520	September 12, 2015
CUDA & openMP Problem with the SDK sample code CUDA Programming and Performance	11	14095	September 12, 2015
OpenMP Multi-GPU, not getting speedup expected CUDA Programming and Performance	5	5905	July 15, 2011
CUDA + OpenMP oddity - looks like a compiler bug. Legacy PGI Compilers	6	12217	April 12, 2010
cudaOpenMP ? only one thread with 2 GPU's? CUDA Programming and Performance	1	11076	March 18, 2011
Problems on OpenMP and multi-GPU Legacy PGI Compilers	5	4556	August 15, 2012
Run-time error for multi-gpu programming with openmp (pgfort Legacy PGI Compilers	11	11620	January 25, 2014
Unexpected behavior with varying number of threads per block CUDA Programming and Performance	2	3461	November 5, 2008
Cuda + omp = big slowdown CUDA Programming and Performance	4	1348	August 20, 2013
Openmp + CUDA how to handle multi GPUs with openmp CUDA Programming and Performance	1	1149	June 10, 2009

cudaOpenMP failed to pass correctResult

Related topics