COMPUTE-SANITIZER " Invalid __shared__ write of size 16 bytes "

bonjour j’ai une erreur de mémoire dans mon code suite à l’utilisation de cp.async.ca.shared global, je crois que n’envoie pas de la bonne façon les adresses mémoire à l’instruction, je ne sais pas si quelqu’un a déjà rencontré le même problème et pourrait m’aider merci

code :

#include <stdint.h>
#include <stdio.h>
#include <iostream>


__global__ void Test(float *nn, float *nn16, float *out){
    __shared__ float s_copy[4];
    asm volatile("cp.async.ca.shared.global [%0], [%1], 16;\n" :: "l"(s_copy+threadIdx.x * 4), "l"(nn+threadIdx.x * 4));
    asm volatile("cp.async.wait_all;\n" ::);
    if (threadIdx.x == 0)
      memcpy(out, s_copy, 4);
}

int main() {
    float* h_C = (float*)malloc(4*sizeof(float));
    float* h_nn = (float*)malloc(393216*sizeof(float));
    for (int i = 0; i < 393216; i = i+1){
        h_nn[i] = 0.11;
    }

    // malloc cuda
    float* d_nn;
    cudaMalloc(&d_nn, 393216);
    float* d_nn16;
    cudaMalloc(&d_nn16, 196608);

    // out
    float* d_C;
    cudaMalloc(&d_C, 4*sizeof(float));
    // copy
    cudaMemcpy(h_nn, d_nn, 393216, cudaMemcpyHostToDevice);

    Test<<<1, 32>>>(d_nn, d_nn16, d_C);
    cudaDeviceSynchronize();
    cudaMemcpy(h_C, d_C, 4*sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << h_C[0];
}

error COMPUTE-SANITIZER:

========= COMPUTE-SANITIZER
========= Program hit invalid argument (error 1) on CUDA API call to cudaMemcpy.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x355b43]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0x5b74d]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7c77]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x51000000 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (1,0,0) in block (0,0,0)
=========     Address 0x51000010 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (2,0,0) in block (0,0,0)
=========     Address 0x51000020 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (3,0,0) in block (0,0,0)
=========     Address 0x51000030 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (4,0,0) in block (0,0,0)
=========     Address 0x51000040 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (5,0,0) in block (0,0,0)
=========     Address 0x51000050 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (6,0,0) in block (0,0,0)
=========     Address 0x51000060 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (7,0,0) in block (0,0,0)
=========     Address 0x51000070 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (8,0,0) in block (0,0,0)
=========     Address 0x51000080 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (9,0,0) in block (0,0,0)
=========     Address 0x51000090 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (10,0,0) in block (0,0,0)
=========     Address 0x510000a0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (11,0,0) in block (0,0,0)
=========     Address 0x510000b0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (12,0,0) in block (0,0,0)
=========     Address 0x510000c0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (13,0,0) in block (0,0,0)
=========     Address 0x510000d0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (14,0,0) in block (0,0,0)
=========     Address 0x510000e0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (15,0,0) in block (0,0,0)
=========     Address 0x510000f0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (16,0,0) in block (0,0,0)
=========     Address 0x51000100 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (17,0,0) in block (0,0,0)
=========     Address 0x51000110 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (18,0,0) in block (0,0,0)
=========     Address 0x51000120 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (19,0,0) in block (0,0,0)
=========     Address 0x51000130 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (20,0,0) in block (0,0,0)
=========     Address 0x51000140 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (21,0,0) in block (0,0,0)
=========     Address 0x51000150 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (22,0,0) in block (0,0,0)
=========     Address 0x51000160 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (23,0,0) in block (0,0,0)
=========     Address 0x51000170 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (24,0,0) in block (0,0,0)
=========     Address 0x51000180 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (25,0,0) in block (0,0,0)
=========     Address 0x51000190 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (26,0,0) in block (0,0,0)
=========     Address 0x510001a0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (27,0,0) in block (0,0,0)
=========     Address 0x510001b0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (28,0,0) in block (0,0,0)
=========     Address 0x510001c0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (29,0,0) in block (0,0,0)
=========     Address 0x510001d0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (30,0,0) in block (0,0,0)
=========     Address 0x510001e0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, float*, float*)
=========     by thread (31,0,0) in block (0,0,0)
=========     Address 0x510001f0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc71b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe40]
=========                in /home/venus/./a.out
=========     Host Frame: [0x806b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Program hit unspecified launch failure (error 719) on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x355b43]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0x3fa17]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf5]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Program hit unspecified launch failure (error 719) on CUDA API call to cudaMemcpy.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x355b43]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0x5b74d]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7d0f]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
8.55932e+12========= ERROR SUMMARY: 35 errors

Thank for help

After studying the problem I believe it comes from the location of the pointers, I looked at the implementation in cultass and these have the area to solve the problem in part with it’s ASM instruction

// The redundant mov PTX instruction is used to enforce the compiler to
// initialize data to zero before ld.global
template <typename AccessType
         >
struct global_load<AccessType,
                   32 
                  > {
  CUTLASS_DEVICE
  global_load(AccessType &D, void const *ptr, bool pred_guard) {
  uint4 *data = reinterpret_cast<uint4 *>(&D);

  asm volatile(
      "{\n"
      "  .reg .pred p;\n"
      "  setp.ne.b32 p, %9, 0;\n"
      "  mov.b32 %0, %10;\n"
      "  mov.b32 %1, %11;\n"
      "  mov.b32 %2, %12;\n"
      "  mov.b32 %3, %13;\n"
      "  mov.b32 %4, %14;\n"
      "  mov.b32 %5, %15;\n"
      "  mov.b32 %6, %16;\n"
      "  mov.b32 %7, %17;\n"
      "  @p ld.global.v4.u32 {%0, %1, %2, %3}, [%8];\n"
      "  @p ld.global.v4.u32 {%4, %5, %6, %7}, [%18];\n"
      "}\n"
      : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w),
        "=r"(data[1].x), "=r"(data[1].y), "=r"(data[1].z), "=r"(data[1].w)
      : "l"(ptr), "r"((int)pred_guard), "r"(data[0].x), "r"(data[0].y),
        "r"(data[0].z), "r"(data[0].w), "r"(data[1].x), "r"(data[1].y),
        "r"(data[1].z), "r"(data[1].w), "l"(((uint8_t *)ptr) + 16));
  }
};

if ever someone who reads this and knows about Asm pointer, or cultass I will be grateful to him because I really block and the ptx documentation on the subject and unfortunately very vague, thank you