If I have the same function marked both `__host__` and `__device__`, is there some kind of `#if defined(isDevice)` test that
I can use inside it to create conditionally compiled code blocks?
e.g.
__host__ __device__ SomeCommonCode( …)
{
#if defined isDevice
… maybe allocate some memory as shared - only for the device version
#else
… stuff appropriate for only the cpu
#endif
}
For example
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <assert.h>
/*
 * Add `val` to the int at `addr` and return the value that was stored there
 * before the addition.
 *
 * The template argument HOST selects the implementation at compile time:
 *   HOST != 0 : plain (non-atomic) read-modify-write of *addr.
 *   HOST == 0 : in the device compilation pass (__CUDA_ARCH__ defined) this
 *               forwards to the hardware atomicAdd(); in the host pass it is
 *               a stub that returns 0 and leaves *addr untouched.
 *
 * Compiling the HOST == 0 path for an architecture below 110 is rejected
 * with an #error.
 */
template<int HOST>
__host__ __device__ int new_atomicAdd(int *addr, int val)
{
    if ( !HOST ){
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 110)
#error atomicAdd does not support sm10
#elif defined(__CUDA_ARCH__)
        // Device pass: use the real atomic.
        return atomicAdd( addr, val ) ;
#else
        // Host pass: nothing sensible to do, report 0.
        return 0 ;
#endif
    }
    // Requested the plain version: ordinary load, add, store.
    const int previous = *addr ;
    *addr = previous + val ;
    return previous ;
}
/*
 * Kernel: every thread bumps *semaphore by one through new_atomicAdd<0> and
 * records the value it observed in data[threadIdx.x].
 *
 * Only threadIdx.x is used as the index, so `data` must hold at least
 * blockDim.x ints; there is no bounds check.
 */
__global__ void foo( int *data, int *semaphore )
{
    const int ticket = new_atomicAdd<0>(semaphore, 1);
    data[threadIdx.x] = ticket ;
}
/*
 * Demo driver for new_atomicAdd.
 *
 * 1. Allocates a zeroed device counter and a device array of `size` ints,
 *    launches foo with one block of `size` threads, copies the array back
 *    and prints it.
 * 2. Calls new_atomicAdd<1> three times on a host counter and prints each
 *    returned value.
 *
 * Every CUDA runtime status (including the kernel launch and the final
 * cleanup calls) is checked with assert, matching the rest of this example.
 */
int main(void)
{
    cudaError_t status ;
    int *h_semaphore = NULL ;
    int *d_semaphore = NULL ;
    int *d_data = NULL ;
    int *h_data = NULL ;
    const int size = 8 ;

    h_semaphore = (int*)malloc(sizeof(int));
    assert( NULL != h_semaphore);
    h_data = (int*)malloc(sizeof(int)*size);
    assert( NULL != h_data );

    status = cudaMalloc((void**)&d_semaphore, sizeof(int));
    assert( cudaSuccess == status );
    status = cudaMemset( d_semaphore, 0, sizeof(int));
    assert( cudaSuccess == status );
    status = cudaMalloc((void**)&d_data, sizeof(int)*size );
    assert( cudaSuccess == status );

    foo<<<1,size>>>(d_data, d_semaphore);
    // A <<<...>>> launch returns no status of its own; a bad launch
    // configuration is only visible through cudaGetLastError().
    status = cudaGetLastError();
    assert( cudaSuccess == status );

    // Blocking copy: also surfaces any error raised while the kernel ran.
    status = cudaMemcpy(h_data, d_data, sizeof(int)*size, cudaMemcpyDeviceToHost);
    assert( cudaSuccess == status );
    for(int i = 0 ; i < size ; i++){
        printf("h_data[%d] = %d\n", i, h_data[i]);
    }

    // Host-side run of the same helper, using the plain (HOST=1) variant.
    h_semaphore[0] = 0;
    for(int i = 0 ; i < 3 ; i++){
        printf("atomicAdd(h_semaphore,1) = %d\n", new_atomicAdd<1>(h_semaphore,1));
    }

    free( h_semaphore );
    free( h_data );
    status = cudaFree( d_semaphore );
    assert( cudaSuccess == status );
    status = cudaFree( d_data );
    assert( cudaSuccess == status );

    // cudaThreadExit() is deprecated; cudaDeviceReset() is its replacement.
    status = cudaDeviceReset();
    assert( cudaSuccess == status );
    return 0;
}
Compile and run it with “nvcc -run -arch=sm_20 main.cu”; the result is:
h_data[0] = 0
h_data[1] = 1
h_data[2] = 2
h_data[3] = 3
h_data[4] = 4
h_data[5] = 5
h_data[6] = 6
h_data[7] = 7
atomicAdd(h_semaphore,1) = 0
atomicAdd(h_semaphore,1) = 1
atomicAdd(h_semaphore,1) = 2
thanks. perfect!
I’d like to reopen this question…
I would like to write:
// Host-only variant: stores the error message `s` by copying it into
// abm_error_buffer (declared elsewhere in the poster's code).
// NOTE(review): strcpy does no bounds checking - assumes the buffer is large
// enough for `s` including its terminating NUL; confirm against its declaration.
__host__ void set_abm_error(const char *s)
{
strcpy(abm_error_buffer, s);
}
// Device-only variant: reports the error message `s` by printing it with the
// "ABM Error: " prefix instead of storing it.
// Has the same name and parameter list as the __host__ definition in this
// snippet, which is what the compiler rejects ("already has a body").
__device__ void set_abm_error(const char *s)
{
// If invoked from a kernel, just print the error to stdout
printf("ABM Error: %s\n", s);
}
This generates a compiler error “function ‘void set_abm_error(const char *)’ already has a body”. It seems to me that this should be perfectly legal and simply generate two different bodies for host and device.
The workaround offered by LSChien is pretty clever, and might be the only solution, but it would clutter my code quite a bit because the template parameter would have to be propagated all the way up the food chain into everything that indirectly or directly calls my function. This is a problem because I would like to invoke my function from within various small methods, such as math operator overload methods in classes that are used by both host and device code.
Why is the compiler choking on the above code? I would expect `__device__` to mean “compile the following to the device binary only” and `__host__` to mean “compile the following to the host binary only”, in which case there should be no name conflict.
Alternatively, the original proposal by LHickey, to provide a preprocessor conditional macro would be very convenient.
Ken
/*
 * Record an error message from either host or device code.
 *
 * The body is chosen per compilation pass via __CUDA_ARCH__:
 *   - macro not defined (host pass): copy `s` into abm_error_buffer;
 *   - macro defined (device pass):   print `s` with an "ABM Error: " prefix.
 */
__host__ __device__ void set_abm_error(const char *s)
{
#ifndef __CUDA_ARCH__
    // Host pass: keep the message for later inspection.
    strcpy(abm_error_buffer, s);
#else
    // Device pass: just print the message.
    printf("ABM Error: %s\n", s);
#endif
}