Same function marked __host__ and __device__, with conditional compilation

If I have the same function marked both __host__ and __device__, is there some kind of
`#if defined isDevice` test that I can use inside it to create conditionally compiled code blocks?
e.g.
__host__ __device__ SomeCommonCode( … )
{
#if defined isDevice
    … maybe allocate some memory as __shared__ (only for the device version)
#else
    … stuff appropriate only for the CPU
#endif
}

For example

#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

#include <assert.h>

/*
 * Add `val` to the integer at `addr` and return the value that was stored
 * there before the addition.
 *
 * Template parameter HOST selects the implementation at compile time:
 *   HOST != 0 : plain, non-atomic read-modify-write on *addr.
 *   HOST == 0 : when compiled for the device (__CUDA_ARCH__ defined) this
 *               forwards to atomicAdd(), which needs __CUDA_ARCH__ >= 110;
 *               older targets are rejected with #error. When compiled for
 *               the host, *addr is left untouched and 0 is returned.
 */
template<int HOST>
__host__ __device__ int new_atomicAdd(int *addr, int val)
{
    if ( !HOST ){
#if defined(__CUDA_ARCH__)
    #if __CUDA_ARCH__ < 110
        #error atomicAdd does not support sm10
    #else
        return atomicAdd( addr, val ) ;
    #endif
#else
        // Host compilation pass of the device flavour: nothing to do.
        return 0 ;
#endif
    }

    // HOST flavour: sequential read-modify-write.
    const int previous = *addr ;
    *addr = previous + val ;
    return previous ;
}

/*
 * Kernel: every thread calls new_atomicAdd<0>(semaphore, 1) and stores the
 * value it gets back in data[threadIdx.x].
 *
 * Only threadIdx.x is used for indexing and there is no bounds check, so
 * `data` must hold at least blockDim.x ints.
 */
__global__ void foo( int *data, int *semaphore )
{
    const int lane = threadIdx.x ;
    const int ticket = new_atomicAdd<0>(semaphore, 1) ;
    data[lane] = ticket ;
}

/*
 * Demo driver.
 *
 * 1. Launches foo<<<1,size>>> so that each of the `size` threads calls
 *    new_atomicAdd<0> on a device counter, then copies the per-thread
 *    results back and prints them.
 * 2. Calls new_atomicAdd<1> three times on a host counter and prints the
 *    returned values.
 *
 * Every CUDA runtime call, including the kernel launch, is checked with
 * assert(); note that these checks vanish when compiled with NDEBUG.
 */
int main(void)
{
    cudaError_t status ;
    int *h_semaphore = NULL ;
    int *d_semaphore = NULL ;
    int *d_data = NULL ;
    int *h_data = NULL ;
    const int size = 8 ;

    // Host buffers.
    h_semaphore = (int*)malloc(sizeof(int));
    assert( NULL != h_semaphore );

    h_data = (int*)malloc(sizeof(int)*size);
    assert( NULL != h_data );

    // Device counter, zero-initialised.
    status = cudaMalloc((void**)&d_semaphore, sizeof(int));
    assert( cudaSuccess == status );

    status = cudaMemset( d_semaphore, 0, sizeof(int));
    assert( cudaSuccess == status );

    // Device output array, one int per thread.
    status = cudaMalloc((void**)&d_data, sizeof(int)*size );
    assert( cudaSuccess == status );

    foo<<<1,size>>>(d_data, d_semaphore);

    // A kernel launch returns nothing; launch failures (bad configuration,
    // no device code for this architecture, ...) are only visible here.
    status = cudaGetLastError();
    assert( cudaSuccess == status );

    // Blocking copy: also waits for the kernel and reports execution errors.
    status = cudaMemcpy(h_data, d_data, sizeof(int)*size, cudaMemcpyDeviceToHost);
    assert( cudaSuccess == status );

    for(int i = 0 ; i < size ; i++){
        printf("h_data[%d] = %d\n", i, h_data[i]);
    }

    // Host flavour of the same function.
    h_semaphore[0] = 0;
    for(int i = 0 ; i < 3 ; i++){
        printf("atomicAdd(h_semaphore,1) = %d\n", new_atomicAdd<1>(h_semaphore,1));
    }

    free( h_semaphore );
    free( h_data );

    status = cudaFree( d_semaphore );
    assert( cudaSuccess == status );

    status = cudaFree( d_data );
    assert( cudaSuccess == status );

    // cudaThreadExit() is deprecated; cudaDeviceReset() is its replacement.
    status = cudaDeviceReset();
    assert( cudaSuccess == status );

    return 0;
}

Compile and run with “nvcc -run -arch=sm_20 main.cu”; the result is

h_data[0] = 0

h_data[1] = 1

h_data[2] = 2

h_data[3] = 3

h_data[4] = 4

h_data[5] = 5

h_data[6] = 6

h_data[7] = 7

atomicAdd(h_semaphore,1) = 0

atomicAdd(h_semaphore,1) = 1

atomicAdd(h_semaphore,1) = 2

thanks. perfect!

I’d like to reopen this question…

I would like to write:

// Host-only variant: copies the message `s` into abm_error_buffer with
// strcpy (no length check is performed here; abm_error_buffer is declared
// elsewhere).
__host__ void set_abm_error(const char *s) {
    (void) strcpy(abm_error_buffer, s);
}

// Device-only variant: when called from kernel code the message is simply
// written to stdout via device-side printf.
__device__ void set_abm_error(const char *s) {
    printf("ABM Error: %s\n", s);
}

This generates a compiler error “function ‘void set_abm_error(const char *)’ already has a body”. It seems to me that this should be perfectly legal and simply generate two different bodies for host and device.

The workaround offered by LSChien is pretty clever, and might be the only solution, but it would clutter my code quite a bit because the template specification would have to be implemented all the way up the food chain into everything that directly or indirectly calls my function. This is a problem because I would like to invoke my function from within various small methods, such as math operator overload methods in classes that are used by both host and device code.

Why is the compiler choking on the above code? I would expect __device__ to mean “compile the following to the device binary only” and __host__ to mean “compile the following to the host binary only”, in which case there should be no name conflict.

Alternatively, the original proposal by LHickey, to provide a preprocessor conditional macro would be very convenient.

Ken

/*
 * Report an error message from either host or device code.
 *
 * __CUDA_ARCH__ is defined only while the device version of the function
 * is being compiled, so a single body yields two behaviours:
 *   host   : copy `s` into abm_error_buffer with strcpy
 *   device : print the message to stdout with printf
 */
__host__ __device__ void set_abm_error(const char *s)
{
#if !defined(__CUDA_ARCH__)
    // Host compilation pass.
    strcpy(abm_error_buffer, s);
#else
    // Device compilation pass.
    printf("ABM Error: %s\n", s);
#endif
}