Generating inline ptx assembly using templates?

Hello,
Does anyone know whether it is possible to construct the string literal for inline PTX assembly at compile time? What I would like is a device function template that takes several enums as template parameters and uses them to select the pieces of the assembly string.

Something like this:

// needs sm_70 or newer (the .relaxed/.acquire/.release/.acq_rel qualifiers on atom require it)

// Memory-ordering qualifiers that can be selected at compile time.
// Each enumerator's numeric value is its index into sem_ids.
enum class Sem {
    relaxed = 0,
    acquire = 1,
    release = 2,
    acq_rel = 3
};

// PTX spelling of every Sem value, indexed by int(Sem).
constexpr const char* sem_ids[] = {
    ".relaxed",
    ".acquire",
    ".release",
    ".acq_rel"
};

// Sketch of the desired interface. This is pseudo-code and does not compile:
// the `magically concat` line stands for the missing compile-time string
// concatenation that the question is about.
//
// sem    - ordering qualifier to splice into the instruction (template argument)
// adress - location operated on; the instruction text names the .global space
// val    - value passed as the instruction's source operand
// Returns the value written to the asm output operand.
template<Sem sem>
__device__
int atomicExchBlock2(int* adress, int val){   
    int res;
    // Instruction prefix, common to every variant.
    constexpr auto a = "atom";
    // Qualifier text chosen by the template argument, e.g. ".acquire".
    constexpr auto b = sem_ids[int(sem)];
    // Remaining qualifiers and the operand list, common to every variant.
    constexpr auto c = ".cta.global.exch.b32 %0, [%1], %2;";
    // Placeholder: a, b and c would have to be joined into one literal here.
    constexpr auto ptx = magically concat a,b,c;
    asm(ptx : "=r"(res) : "l"(adress), "r"(val));
    return res;
}

Of course, this simple example could be solved using template specialization.

// Primary template: declared only, never defined here. The behaviour for each
// Sem value is supplied by an explicit specialization.
template<Sem>
struct AtomicExchBlock2;

// Atomic exchange at .cta scope with .relaxed ordering (needs sm_70).
template<>
struct AtomicExchBlock2<Sem::relaxed>{
    // Atomically writes `val` to `*adress` and returns the value that was
    // stored there before. `adress` must refer to global memory, because the
    // instruction explicitly names the .global state space.
    __device__
    int operator()(int* adress, int val){
        int res;
        // `volatile` stops the compiler from deleting, merging or moving the
        // instruction; the "memory" clobber tells it that memory is modified,
        // so values cached in registers are not reused across the atomic.
        asm volatile("atom.relaxed.cta.global.exch.b32 %0, [%1], %2;" : "=r"(res) : "l"(adress), "r"(val) : "memory");
        return res;
    }
};

// Atomic exchange at .cta scope with .acquire ordering (needs sm_70).
template<>
struct AtomicExchBlock2<Sem::acquire>{
    // Atomically writes `val` to `*adress` and returns the value that was
    // stored there before. `adress` must refer to global memory, because the
    // instruction explicitly names the .global state space.
    __device__
    int operator()(int* adress, int val){
        int res;
        // `volatile` stops the compiler from deleting, merging or moving the
        // instruction; the "memory" clobber keeps later loads from being
        // hoisted above the acquire at the compiler level.
        asm volatile("atom.acquire.cta.global.exch.b32 %0, [%1], %2;" : "=r"(res) : "l"(adress), "r"(val) : "memory");
        return res;
    }
};

// Atomic exchange at .cta scope with .release ordering (needs sm_70).
template<>
struct AtomicExchBlock2<Sem::release>{
    // Atomically writes `val` to `*adress` and returns the value that was
    // stored there before. `adress` must refer to global memory, because the
    // instruction explicitly names the .global state space.
    __device__
    int operator()(int* adress, int val){
        int res;
        // `volatile` stops the compiler from deleting, merging or moving the
        // instruction; the "memory" clobber keeps earlier stores from being
        // sunk below the release at the compiler level.
        asm volatile("atom.release.cta.global.exch.b32 %0, [%1], %2;" : "=r"(res) : "l"(adress), "r"(val) : "memory");
        return res;
    }
};

// Atomic exchange at .cta scope with .acq_rel ordering (needs sm_70).
template<>
struct AtomicExchBlock2<Sem::acq_rel>{
    // Atomically writes `val` to `*adress` and returns the value that was
    // stored there before. `adress` must refer to global memory, because the
    // instruction explicitly names the .global state space.
    __device__
    int operator()(int* adress, int val){
        int res;
        // `volatile` stops the compiler from deleting, merging or moving the
        // instruction; the "memory" clobber keeps surrounding loads and stores
        // from being moved across it at the compiler level.
        asm volatile("atom.acq_rel.cta.global.exch.b32 %0, [%1], %2;" : "=r"(res) : "l"(adress), "r"(val) : "memory");
        return res;
    }
};

But is it possible without explicit specialization (of all combinations of possible template arguments)?

On the host, it is possible to concatenate strings using templates. However, the same code does not work in device code.
See this toy example below.

#include <cstdio>

// Compile-time concatenation of two null-terminated character arrays that are
// passed as pointer template arguments.
namespace str {

    // Type-level list of ints; used only as a tag, so it is never defined.
    template<int...>
    struct is;

    // con_is<is<A...>, is<B...>>::type is the joined list is<A..., B...>.
    template<class, class>
    struct con_is;

    template<int... Head, int... Tail>
    struct con_is<is<Head...>, is<Tail...>> {
        using type = is<Head..., Tail...>;
    };

    // is_maker<N>::type is is<0, 1, ..., N-1>, built by appending N-1 to the
    // list for N-1.
    template<int N>
    struct is_maker {
        using type = typename con_is<typename is_maker<N - 1>::type, is<N - 1>>::type;
    };

    // Recursion anchor: the empty list.
    template<>
    struct is_maker<0> {
        using type = is<>;
    };

    template<int N>
    using make_is = typename is_maker<N>::type;

    // Number of characters before the terminating zero; 0 for a null pointer.
    constexpr auto size(const char* s) {
        int len = 0;
        if (s) {
            for (const char* p = s; *p != 0; ++p) {
                ++len;
            }
        }
        return len;
    }

    template<const char*, class, const char*, class>
    struct concat_impl;

    // Expands both index lists into one array initializer and appends the
    // terminating zero.
    template<const char* Lhs, int... LhsIdx, const char* Rhs, int... RhsIdx>
    struct concat_impl<Lhs, is<LhsIdx...>, Rhs, is<RhsIdx...>> {
        static constexpr const char value[] { Lhs[LhsIdx]..., Rhs[RhsIdx]..., 0 };
    };

    // Namespace-scope definition of the static data member declared above.
    template<const char* Lhs, int... LhsIdx, const char* Rhs, int... RhsIdx>
    constexpr const char concat_impl<Lhs, is<LhsIdx...>, Rhs, is<RhsIdx...>>::value[];

    // concat<S1, S2> refers to the characters of S1 followed by those of S2.
    template<const char* S1, const char* S2>
    constexpr auto concat =
        concat_impl<S1, make_is<size(S1)>, S2, make_is<size(S2)>>::value;

} // namespace str





// Atomic exchange at .cta scope with no explicit ordering qualifier.
// Atomically writes `val` to `*adress` and returns the value that was stored
// there before. `adress` must refer to global memory, because the instruction
// explicitly names the .global state space.
__device__
int atomicExchBlock(int* adress, int val){
    int res;
    // `volatile` stops the compiler from deleting, merging or moving the
    // instruction; the "memory" clobber tells it that memory is modified.
    asm volatile("atom.cta.global.exch.b32 %0, [%1], %2;" : "=r"(res) : "l"(adress), "r"(val) : "memory");
    return res;
}



// needs sm_70 or newer (the .relaxed/.acquire/.release/.acq_rel qualifiers on atom require it)

// Memory-ordering qualifiers that can be selected at compile time.
// Each enumerator's numeric value is its index into sem_ids.
enum class Sem {
    relaxed = 0,
    acquire = 1,
    release = 2,
    acq_rel = 3
};

// PTX spelling of every Sem value, indexed by int(Sem).
constexpr const char* sem_ids[] = {
    ".relaxed",
    ".acquire",
    ".release",
    ".acq_rel"
};

#if 0

// Attempt to build the instruction text with str::concat. It is kept disabled
// by the surrounding #if 0 because it does not compile (see the error below).
//
// sem    - ordering qualifier to splice into the instruction (template argument)
// adress - location operated on; the instruction text names the .global space
// val    - value passed as the instruction's source operand
// Returns the value written to the asm output operand.
template<Sem sem>
__device__
int atomicExchBlock2(int* adress, int val){   
    int res;
    // Instruction prefix, common to every variant.
    constexpr auto a = "atom";
    // Qualifier text chosen by the template argument, e.g. ".acquire".
    constexpr auto b = sem_ids[int(sem)];
    // Remaining qualifiers and the operand list, common to every variant.
    constexpr auto c = ".cta.global.exch.b32 %0, [%1], %2;";
    // error: a template argument may not reference a non-external entity
    // (a, b and c are local pointers to string literals, which str::concat
    // cannot accept as template arguments)
    constexpr auto ptx = str::concat<a, str::concat<b, c>>;
    asm(ptx : "=r"(res) : "l"(adress), "r"(val));
    return res;
}

#endif

// Primary template: declared only, never defined here. The behaviour for each
// Sem value is supplied by an explicit specialization.
template<Sem>
struct AtomicExchBlock2;

// Atomic exchange at .cta scope with .relaxed ordering (needs sm_70).
template<>
struct AtomicExchBlock2<Sem::relaxed>{
    // Atomically writes `val` to `*adress` and returns the value that was
    // stored there before. `adress` must refer to global memory, because the
    // instruction explicitly names the .global state space.
    __device__
    int operator()(int* adress, int val){
        int res;
        // `volatile` stops the compiler from deleting, merging or moving the
        // instruction; the "memory" clobber tells it that memory is modified,
        // so values cached in registers are not reused across the atomic.
        asm volatile("atom.relaxed.cta.global.exch.b32 %0, [%1], %2;" : "=r"(res) : "l"(adress), "r"(val) : "memory");
        return res;
    }
};

// Atomic exchange at .cta scope with .acquire ordering (needs sm_70).
template<>
struct AtomicExchBlock2<Sem::acquire>{
    // Atomically writes `val` to `*adress` and returns the value that was
    // stored there before. `adress` must refer to global memory, because the
    // instruction explicitly names the .global state space.
    __device__
    int operator()(int* adress, int val){
        int res;
        // `volatile` stops the compiler from deleting, merging or moving the
        // instruction; the "memory" clobber keeps later loads from being
        // hoisted above the acquire at the compiler level.
        asm volatile("atom.acquire.cta.global.exch.b32 %0, [%1], %2;" : "=r"(res) : "l"(adress), "r"(val) : "memory");
        return res;
    }
};

// Atomic exchange at .cta scope with .release ordering (needs sm_70).
template<>
struct AtomicExchBlock2<Sem::release>{
    // Atomically writes `val` to `*adress` and returns the value that was
    // stored there before. `adress` must refer to global memory, because the
    // instruction explicitly names the .global state space.
    __device__
    int operator()(int* adress, int val){
        int res;
        // `volatile` stops the compiler from deleting, merging or moving the
        // instruction; the "memory" clobber keeps earlier stores from being
        // sunk below the release at the compiler level.
        asm volatile("atom.release.cta.global.exch.b32 %0, [%1], %2;" : "=r"(res) : "l"(adress), "r"(val) : "memory");
        return res;
    }
};

// Atomic exchange at .cta scope with .acq_rel ordering (needs sm_70).
template<>
struct AtomicExchBlock2<Sem::acq_rel>{
    // Atomically writes `val` to `*adress` and returns the value that was
    // stored there before. `adress` must refer to global memory, because the
    // instruction explicitly names the .global state space.
    __device__
    int operator()(int* adress, int val){
        int res;
        // `volatile` stops the compiler from deleting, merging or moving the
        // instruction; the "memory" clobber keeps surrounding loads and stores
        // from being moved across it at the compiler level.
        asm volatile("atom.acq_rel.cta.global.exch.b32 %0, [%1], %2;" : "=r"(res) : "l"(adress), "r"(val) : "memory");
        return res;
    }
};



// Demo kernel: performs both exchange variants on data[0] with the value 2 and
// stores what each one returned in data[1] (the second store overwrites the
// first). Every thread touches the same two elements.
__global__
void kernel(int* data){
    // Variant without an ordering qualifier.
    const int previous = atomicExchBlock(data, 2);
    data[1] = previous;

    //data[1] = atomicExchBlock2(data, 2);

    // Specialization-based variant with acquire ordering.
    data[1] = AtomicExchBlock2<Sem::acquire>{}(data, 2);
}


#if 0

 
// Inputs for the device-side concatenation experiment in dfunc.
// (constexpr already makes the arrays const.)
constexpr char a2[] = "hello2 ";
constexpr char b2[] = "world2";

// Device-side counterpart of the host concatenation test. It is kept disabled
// by the surrounding #if 0 because it does not compile; the diagnostics are
// recorded below.
__device__
void dfunc(){
// error: identifier "str::concat_impl<&::a2,  ::str::is<(int)0, (int)1, (int)2, (int)3, (int)4, (int)5, (int)6 > , &::b2,  ::str::is<(int)0, (int)1, (int)2, (int)3, (int)4, (int)5 > > ::value" is undefined in device code
//  however, it also cannot be declared as device variable, because "A __device__ variable cannot be marked constexpr"
    // Same expression that works in main(), here referenced from device code.
    constexpr auto c = str::concat<a2, b2>;
    printf("c = %s\n", c);
}
#endif


// Inputs for the host-side concatenation test in main.
// (constexpr already makes the arrays const.)
constexpr char a[] = "hello ";
constexpr char b[] = "world";

// Host driver: prints the host-side concatenation result, then runs the demo
// kernel with one thread on a managed buffer and prints its first two ints.
// Returns 0 on success and 1 if a CUDA runtime call or the kernel launch fails.
int main() {
    // Compile-time concatenation works in host code.
    constexpr auto c = str::concat<a, b>;
    printf("c = %s\n", c);

    // Reports a failed CUDA call on stderr; returns true when `err` is an error.
    auto failed = [](cudaError_t err, const char* what) {
        if (err == cudaSuccess) return false;
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        return true;
    };

    int* data = nullptr;
    if (failed(cudaMallocManaged(&data, 128), "cudaMallocManaged")) return 1;
    data[0] = 1;
    data[1] = 13;

    kernel<<<1,1>>>(data);
    // A bad launch is reported by cudaGetLastError; faults during execution
    // only surface at the synchronizing call.
    const bool bad = failed(cudaGetLastError(), "kernel launch")
                  || failed(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Only read the buffer on the host once the kernel is known to have finished.
    if (!bad) printf("%d %d\n", data[0], data[1]);

    cudaFree(data);
    return bad ? 1 : 0;
}