FYI – A strange and annoying behavior (CUDA 2.3).
When compiling with nvcc, “#pragma omp parallel” doesn’t work in a templated function (i.e. only 1 thread executes).
It works fine in a non-template function.
When compiling directly with Visual C++ it works, as expected, in either type of function.
The test case is a trivial modification of the classic cudaOpenMP example.
-Mike
[codebox]/*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation and
* any modifications thereto. Any use, reproduction, disclosure, or distribution
* of this software and related documentation without an express license
* agreement from NVIDIA Corporation is strictly prohibited.
*/
/*
* Multi-GPU sample using OpenMP for threading on the CPU side
* needs a compiler that supports OpenMP 2.0
*/
#include <omp.h>
#include <stdio.h> // stdio functions are used since C++ streams aren't necessarily thread safe
#include <cutil_inline.h>
using namespace std;
// a simple kernel that increments each array element by b
__global__ void kernelAddConstant(int *g_a, const int b)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
g_a[idx] += b;
}
// a predicate that checks whether each array element is set to its index plus b
int correctResult(int *data, const int n, const int b)
{
for(int i = 0; i < n; i++)
if(data[i] != i + b)
return 0;
return 1;
}
int testFunc1( int n )
{
omp_set_num_threads( n );
#pragma omp parallel
{
int i = omp_get_thread_num();
int n = omp_get_num_threads();
printf_s("testFunc1: Hello from thread %d, num threads = %d\n", i, n);
}
return 0;
}
template <typename T>
int testFunc2( T n )
{
omp_set_num_threads( n );
#pragma omp parallel
{
int i = omp_get_thread_num();
int n = omp_get_num_threads();
printf_s("testFunc2: Hello from thread %d, num threads = %d\n", i, n);
}
return 0;
}
int main(int argc, char *argv[])
{
int num_gpus = 0; // number of CUDA GPUs
testFunc1( 4 );
testFunc2( 3 );
/////////////////////////////////////////////////////////////////
// determine the number of CUDA capable GPUs
//
cudaGetDeviceCount(&num_gpus);
if(num_gpus < 1)
{
printf("no CUDA capable devices were detected\n");
return 1;
}
/////////////////////////////////////////////////////////////////
// display CPU and GPU configuration
//
printf("number of host CPUs:\t%d\n", omp_get_num_procs());
printf("number of CUDA devices:\t%d\n", num_gpus);
for(int i = 0; i < num_gpus; i++)
{
cudaDeviceProp dprop;
cudaGetDeviceProperties(&dprop, i);
printf(" %d: %s\n", i, dprop.name);
}
printf("---------------------------\n");
/////////////////////////////////////////////////////////////////
// initialize data
//
unsigned int n = num_gpus * 8192;
unsigned int nbytes = n * sizeof(int);
int *a = 0; // pointer to data on the CPU
int b = 3; // value by which the array is incremented
a = (int*)malloc(nbytes);
if(0 == a)
{
printf("couldn't allocate CPU memory\n");
return 1;
}
for(unsigned int i = 0; i < n; i++)
a[i] = i;
////////////////////////////////////////////////////////////////
// run as many CPU threads as there are CUDA devices
// each CPU thread controls a different device, processing its
// portion of the data. It's possible to use more CPU threads
// than there are CUDA devices, in which case several CPU
// threads will be allocating resources and launching kernels
// on the same device. For example, try omp_set_num_threads(2*num_gpus);
// Recall that all variables declared inside an "omp parallel" scope are
// local to each CPU thread
//
omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
//omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there are CUDA devices
printf( "Before parallel section, num_threads: %d max_threads: %d\n",
omp_get_num_threads(), omp_get_max_threads() );
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
unsigned int num_cpu_threads = omp_get_num_threads();
// set and check the CUDA device for this CPU thread
int gpu_id = -1;
CUDA_SAFE_CALL(cudaSetDevice(cpu_thread_id % num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
CUDA_SAFE_CALL(cudaGetDevice(&gpu_id));
printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);
int *d_a = 0; // pointer to memory on the device associated with this CPU thread
int *sub_a = a + cpu_thread_id * n / num_cpu_threads; // pointer to this CPU thread's portion of data
unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
dim3 gpu_threads(128); // 128 threads per block
dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_a, nbytes_per_kernel));
CUDA_SAFE_CALL(cudaMemset(d_a, 0, nbytes_per_kernel));
CUDA_SAFE_CALL(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);
CUDA_SAFE_CALL(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(d_a));
}
printf("---------------------------\n");
cudaError_t last_err = cudaGetLastError(); // fetch once: cudaGetLastError() resets the error state
if(cudaSuccess != last_err)
printf("%s\n", cudaGetErrorString(last_err));
////////////////////////////////////////////////////////////////
// check the result
//
if(correctResult(a, n, b))
printf("Test PASSED\n");
else
printf("Test FAILED\n");
free(a); // free CPU memory
cudaThreadExit();
cutilExit(argc, argv);
return 0;
}
[/codebox]
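In case it helps anyone who hits the same thing: since the pragma behaves correctly in non-template functions, one workaround that seems plausible is to hoist the parallel region out of the template into a plain (non-template) helper and call that from the templated code. A minimal sketch of the idea (parallelHello is just an illustrative name, and I haven't verified this on every configuration):

[codebox]#include <omp.h>
#include <stdio.h>

// Non-template helper: per the observation above, nvcc handles the
// pragma correctly here, so the parallel region lives in this function
void parallelHello(const char *tag)
{
#pragma omp parallel
    {
        int i = omp_get_thread_num();
        int nt = omp_get_num_threads();
        printf("%s: Hello from thread %d, num threads = %d\n", tag, i, nt);
    }
}

// Templated wrapper: sets the thread count, then delegates the
// actual parallel region to the non-template helper
template <typename T>
int testFunc2( T n )
{
    omp_set_num_threads( (int)n );
    parallelHello("testFunc2");
    return 0;
}
[/codebox]

This matches the behavior described above (non-template functions fork correctly), so delegating the region should sidestep the issue until nvcc is fixed.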