How to correctly use inline asm to make a vector load

Hello ! I was trying to use some inline asm to perform a vector load. This version seems to work fine:

__device__ void make_load(const float4* w) 
{
  float4 data0;
  asm volatile ("ld.global.f32 %0, [%1];" : "=f"(data0.x) : "l"(w + 0));
  asm volatile ("ld.global.f32 %0, [%1];" : "=f"(data0.y) : "l"(w + 1));
  asm volatile ("ld.global.f32 %0, [%1];" : "=f"(data0.z) : "l"(w + 2));
  asm volatile ("ld.global.f32 %0, [%1];" : "=f"(data0.w) : "l"(w + 3));
  ...
}

However, this version with the vector load gives me an “Invalid __global__ read of size 16 bytes”:

__device__ void make_load(const float4* w) 
{
  float4 data0;
  asm volatile ("ld.global.v4.f32 { %0, %1, %2, %3 }, [%4];" : "=f"(data0.x) , "=f"(data0.y) , "=f"(data0.z) , "=f"(data0.w) : "l"(w));
  ...
}

Im sure it must be either a syntax or alignment mistake on the pointer. However I have gone through some documentation and posts and Im failing to see any issues with the code.

Thanks in advance

Whatever is triggering that error report is something outside the posted code. Compare this, which works just fine:

#include <stdio.h>
#include <stdlib.h>

__device__ float4 make_load (const float4* w) 
{
    float4 data0;
    asm volatile ("ld.global.v4.f32 { %0, %1, %2, %3 }, [%4];" : "=f"(data0.x) , "=f"(data0.y) , "=f"(data0.z) , "=f"(data0.w) : "l"(w));
    return data0;
}

__global__ void kernel (float4 *p)
{
    float4 data = make_load (p);
    printf ("data = {%15.8e %15.8e %15.8e %15.8e}\n", data.x, data.y, data.z, data.w);
}

int main (void)
{
    float4 hdata = {1.0f, 2.0f, 3.0f, 4.0f};
    float4 *ddata = 0;
    cudaMalloc ((void**)&ddata, sizeof hdata);
    cudaMemcpy (ddata, &hdata, sizeof hdata, cudaMemcpyHostToDevice);
    kernel<<<1,1>>>(ddata);
    cudaDeviceSynchronize();
    return EXIT_SUCCESS;
}

Is your data aligned to 16 bytes? Or perhaps you have to translate your address with __cvta_generic_to_global()?

@njuffa seems to be right, my issue was the typical mistake, I was trying to load from global using a constant memory address.

Why did it work without the vector load ? No clue, it should have failed as well.

Thanks !

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.