OpenMP doesn't work in a templated function

FYI – A strange and annoying behavior (CUDA 2.3).

When compiling with nvcc, “#pragma omp parallel” doesn’t work in a templated function (i.e. only 1 thread executes).

It works fine in a non-template function.

When compiling directly with Visual C++ it works, as expected, in either type of function.

Test case is trivial modification of classic cudaOpenMP example.

-Mike

[codebox]/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation and
 * any modifications thereto.  Any use, reproduction, disclosure, or distribution
 * of this software and related documentation without an express license
 * agreement from NVIDIA Corporation is strictly prohibited.
 */

/*
 * Multi-GPU sample using OpenMP for threading on the CPU side
 * needs a compiler that supports OpenMP 2.0
 */

#include <omp.h>

#include <stdio.h> // stdio functions are used since C++ streams aren’t necessarily thread safe

#include <cutil_inline.h>

using namespace std;

// A simple kernel that increments each array element by b.
// Launch layout: 1D grid of 1D blocks; the caller sizes the grid so that
// gridDim.x * blockDim.x exactly covers its slice of the array, so no
// bounds check is performed here (see the launch in main()).
// Fixes vs. the forum transcription: restored the __global__ qualifier
// and made the parameter name consistent ("B" was declared, "b" used).
__global__ void kernelAddConstant(int *g_a, const int b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_a[idx] += b;
}

// Predicate that checks whether each array element is set to its index
// plus B.  Returns 1 when all n elements satisfy data[i] == i + B,
// and 0 as soon as any element does not.
int correctResult(int *data, const int n, const int B)
{
    int ok = 1;
    for (int i = 0; ok && i < n; ++i)
        ok = (data[i] == i + B);
    return ok;
}

// Diagnostic helper (non-template control case for the reported nvcc bug):
// spawns n OpenMP threads; each prints its thread id and the team size.
// Always returns 0.
// Fixes: the inner local previously reused the name "n", shadowing the
// parameter — renamed for clarity; MSVC-only printf_s replaced with
// standard printf so the sample is portable.
int testFunc1( int n )
{
    omp_set_num_threads( n );
    #pragma omp parallel
    {
        // Variables declared inside the parallel region are per-thread.
        int tid = omp_get_thread_num();
        int nthreads = omp_get_num_threads();
        printf("testFunc1: Hello from thread %d, num threads = %d\n", tid, nthreads);
    }
    return 0;
}

// Templated twin of testFunc1, used to demonstrate that under nvcc
// (CUDA 2.3) "#pragma omp parallel" only runs 1 thread inside a
// template instantiation.  Always returns 0.
// Fixes: the forum transcription dropped the template parameter list
// (bare "template" does not compile) — restored as template <typename T>;
// the inner local shadowed parameter n — renamed; MSVC-only printf_s
// replaced with standard printf.
template <typename T>
int testFunc2( T n )
{
    // omp_set_num_threads takes an int; T is expected to be an
    // integral type here (the sample calls testFunc2(3)).
    omp_set_num_threads( (int)n );
    #pragma omp parallel
    {
        // Variables declared inside the parallel region are per-thread.
        int tid = omp_get_thread_num();
        int nthreads = omp_get_num_threads();
        printf("testFunc2: Hello from thread %d, num threads = %d\n", tid, nthreads);
    }
    return 0;
}

// Entry point.  First runs the OpenMP-in-template repro (testFunc1 vs.
// testFunc2), then the stock cudaOpenMP sample: one CPU thread per CUDA
// device, each copying its slice of a host array to its device,
// incrementing every element by b with kernelAddConstant, copying it
// back, and finally verifying the whole array on the host.
// Fixes vs. the forum transcription: argv declared as char *argv[]
// (cutilExit expects an argv array, and "char *argv" is wrong for main);
// the kernel launch and correctResult call used "B" while the variable
// is declared "b" — made consistent.
int main(int argc, char *argv[])
{
    int num_gpus = 0;   // number of CUDA GPUs

    // Bug repro: parallel region in a plain vs. a templated function.
    testFunc1( 4 );
    testFunc2( 3 );

    /////////////////////////////////////////////////////////////////
    // determine the number of CUDA capable GPUs
    //
    cudaGetDeviceCount(&num_gpus);
    if(num_gpus < 1)
    {
        printf("no CUDA capable devices were detected\n");
        return 1;
    }

    /////////////////////////////////////////////////////////////////
    // display CPU and GPU configuration
    //
    printf("number of host CPUs:\t%d\n", omp_get_num_procs());
    printf("number of CUDA devices:\t%d\n", num_gpus);
    for(int i = 0; i < num_gpus; i++)
    {
        cudaDeviceProp dprop;
        cudaGetDeviceProperties(&dprop, i);
        printf("   %d: %s\n", i, dprop.name);
    }
    printf("---------------------------\n");

    /////////////////////////////////////////////////////////////////
    // initialize data
    //
    unsigned int n = num_gpus * 8192;       // total element count (8192 per GPU)
    unsigned int nbytes = n * sizeof(int);
    int *a = 0;     // pointer to data on the CPU
    int b = 3;      // value by which the array is incremented
    a = (int*)malloc(nbytes);
    if(0 == a)
    {
        printf("couldn't allocate CPU memory\n");
        return 1;
    }
    for(unsigned int i = 0; i < n; i++)
        a[i] = i;

    ////////////////////////////////////////////////////////////////
    // run as many CPU threads as there are CUDA devices
    //   each CPU thread controls a different device, processing its
    //   portion of the data.  It's possible to use more CPU threads
    //   than there are CUDA devices, in which case several CPU
    //   threads will be allocating resources and launching kernels
    //   on the same device.  For example, try omp_set_num_threads(2*num_gpus);
    //   Recall that all variables declared inside an "omp parallel" scope are
    //   local to each CPU thread
    //
    omp_set_num_threads(num_gpus);  // create as many CPU threads as there are CUDA devices
    //omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there are CUDA devices

    // Outside a parallel region omp_get_num_threads() reports 1; this
    // print illustrates that (part of the bug-report diagnostics).
    printf( "Before parallel section, num_threads: %d max_threads: %d\n",
        omp_get_num_threads(), omp_get_max_threads() );

#pragma omp parallel
    {
        unsigned int cpu_thread_id = omp_get_thread_num();
        unsigned int num_cpu_threads = omp_get_num_threads();

        // set and check the CUDA device for this CPU thread
        int gpu_id = -1;
        CUDA_SAFE_CALL(cudaSetDevice(cpu_thread_id % num_gpus));    // "% num_gpus" allows more CPU threads than GPU devices
        CUDA_SAFE_CALL(cudaGetDevice(&gpu_id));
        printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);

        int *d_a = 0;   // pointer to memory on the device associated with this CPU thread
        int *sub_a = a + cpu_thread_id * n / num_cpu_threads;   // pointer to this CPU thread's portion of data
        unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
        dim3 gpu_threads(128);  // 128 threads per block
        // Grid sized so blocks * threads exactly covers this thread's slice;
        // kernelAddConstant relies on this and does no bounds check.
        dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));

        CUDA_SAFE_CALL(cudaMalloc((void**)&d_a, nbytes_per_kernel));
        CUDA_SAFE_CALL(cudaMemset(d_a, 0, nbytes_per_kernel));
        CUDA_SAFE_CALL(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));

        kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);

        CUDA_SAFE_CALL(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaFree(d_a));
    }
    printf("---------------------------\n");

    if(cudaSuccess != cudaGetLastError())
        printf("%s\n", cudaGetErrorString(cudaGetLastError()));

    ////////////////////////////////////////////////////////////////
    // check the result
    //
    if(correctResult(a, n, b))
        printf("Test PASSED\n");
    else
        printf("Test FAILED\n");

    free(a); // free CPU memory

    cudaThreadExit();
    cutilExit(argc, argv);

    return 0;
}

[/codebox]

Hmm, no response…

Is there a more “official” way to submit a bug report on nvcc?

This is not a show stopper by any means, but OpenMP is convenient and elegant for managing multiple GPUs and a templated function is convenient and elegant for maintaining single and double precision versions of my GPU manager. It would be nice if they worked together in the next version of CUDA.

Thanks,

Mike

We’re tracking this internally. (registered dev site gives you access to a more formal bug report database)

Thanks. I guessed that was the case. I just wanted to make sure it was a reasonable report, enough info, etc.

Cheers,

Mike

Coincidentally, I reported the same bug yesterday (nvbug #597197)