Help with Inline Assembly Syntax

Hi All,

I am trying to use inline assembly to generate different flavors of loads and stores, but I cannot get the syntax right. Can someone help me, please?

Here is the CUDA code I am trying to reproduce with inline asm.

// Reference kernel: each thread reads one element of `array`, adds one to it,
// and writes the result to the same index of `arrayout`.
//
// Indexing is a flat 1D global thread index. There is no bounds check, so the
// launch must not create more threads than both buffers have elements.
__global__ void exampleCuda(float * array, float * arrayout) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    // Load from the input buffer and store to the output buffer, matching the
    // parameter names and the direction used by the inline-PTX versions.
    // The `1.0f` literal keeps the addition in single precision.
    arrayout[tid] = array[tid] + 1.0f;
}

This is what I wrote, but it is not correct. Where am I going wrong?

// First attempt at expressing the kernel with inline PTX: each thread computes
// a flat 1D index, loads array[tid] with ld.global.f32, increments the value,
// and then tries to store it to arrayout[tid] with st.global.f32.
__global__ void ptxCode(float * array, float * arrayout) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    float* ptr = &array[tid];
    float *outPtr = &arrayout[tid];    
    float data;
    // Load: %0 is the output operand bound to `data` ("=f"),
    // %1 is the input operand bound to the address `ptr` ("l").
    asm volatile("ld.global.f32 %0, [%1];" : "=f"(data) : "l"(ptr));
    data++;
    // NOTE(review): `outPtr` is placed in the output-operand list ("=l"), which
    // declares that the asm writes the pointer variable itself. The st
    // instruction only uses %0 as the address to store through, so the pointer
    // is really an input here — this looks like the cause of the incorrect
    // behavior; both the address and the value should be input operands.
    asm volatile("st.global.f32 [%0], %1;" : "=l"(outPtr) : "f"(data));
}

Godbolt link

Thanks in advance

I think I found a solution

// Adds one to array[tid] and writes the result to arrayout[tid], issuing the
// load and the store as explicit inline-PTX instructions.
//
// Indexing is a flat 1D global thread index. There is no bounds check, so the
// launch must not create more threads than both buffers have elements.
// NOTE(review): ld.global / st.global expect addresses in the global state
// space; this assumes `array` and `arrayout` point to global memory — confirm.
__global__ void ptxCode(float * array, float * arrayout) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    float* ptr = &array[tid];
    float *outPtr = &arrayout[tid];
    float data;
    // Load: %0 is the output operand bound to `data` ("=f"), %1 is the input
    // operand holding the address ("l"). The "memory" clobber tells the
    // compiler that the asm accesses memory indirectly through an operand,
    // so it must not cache or reorder memory accesses across this statement.
    asm volatile ("ld.global.f32 %0, [%1];" : "=f"(data) : "l"(ptr) : "memory");
    data += 1.0f;
    // Store: there are no output operands — the address and the value are both
    // inputs. The write to memory is a side effect that is not visible through
    // the operand list, which is why the "memory" clobber is required here.
    asm volatile ("st.global.f32 [%0], %1;" : : "l"(outPtr), "f"(data) : "memory");
}

But now I cannot get it right with the vector loads. How can I write this code with PTX inline asm?

// Vectorized variant: each thread loads one float4 (four consecutive floats)
// from `array`, adds one to every component, and stores the float4 to the same
// position in `arrayout`.
//
// `tid` indexes float4 elements, not individual floats. Both pointers are
// reinterpreted as float4*, and there is no bounds check.
__global__ void cudaCodeVector(float* array, float * arrayout) {
    const int vecIdx = blockDim.x * blockIdx.x + threadIdx.x;

    // View the scalar buffers as arrays of 4-wide vectors.
    const float4* src = reinterpret_cast<const float4*>(array);
    float4* dst = reinterpret_cast<float4*>(arrayout);

    const float4 in = src[vecIdx];
    dst[vecIdx] = make_float4(in.x + 1.0f, in.y + 1.0f, in.z + 1.0f, in.w + 1.0f);
}

This might be helpful:
https://stackoverflow.com/questions/56719743/simple-add-of-vectors-in-inline-ptx-cuda