Error cp.async.ca.shared.global

Hello, I’m trying to optimize my CUDA code using cp.async.ca.shared.global, but I get an error that I can’t fix.

test code

#include <stdint.h>

// First attempt from the question: each thread tries to issue one 16-byte
// cp.async copy from the global buffer nn into its own row of the shared array
// s_copy, then waits for the copy. nn16 is not used.
// NOTE(review): this listing is kept exactly as posted; it does not compile
// (see the nvcc error quoted after the listing).
__global__ void Test(float *nn, uint32_t *nn16){
    __shared__ float s_copy[32][4];
    // Not declared const, so it is not an integral constant expression, which
    // is what the "n" constraint below requires.
    int SizeInBytes = 16;
    // The placeholders start at %1 although asm operands are numbered from %0,
    // and both address operands are passed as float VALUES through the "f"
    // (floating-point register) constraint instead of as pointers.
    asm volatile("cp.async.ca.shared.global [%1], [%2], %3;\n" :: "f"(s_copy[threadIdx.x][0]), "f"(nn[threadIdx.x + 4]), "n"(SizeInBytes));
    // Waits for the cp.async operations previously issued by this thread.
    asm volatile("cp.async.wait_all;\n" ::);
}

// Host driver: fills a host buffer with alternating +0.11 / -0.11, uploads it
// to the device and launches Test with one block of 32 threads.
// Returns 0 on success and 1 if the host allocation or any CUDA call fails.
int main() {
    // All sizes are in BYTES. The element count is derived from the byte size
    // so that the fill loop stays inside the host allocation.
    const size_t nnBytes   = 393216;
    const size_t nn16Bytes = 196608;
    const size_t nnCount   = nnBytes / sizeof(float);

    float* h_nn = (float*)malloc(nnBytes);
    if (!h_nn) return 1;
    for (size_t i = 0; i < nnCount; i = i + 2){
        h_nn[i] = 0.11f;
        h_nn[i+1] = -0.11f;
    }

    // malloc cuda
    float* d_nn = nullptr;
    uint32_t* d_nn16 = nullptr;
    cudaError_t err = cudaMalloc(&d_nn, nnBytes);
    if (err == cudaSuccess) err = cudaMalloc(&d_nn16, nn16Bytes);

    // copy: cudaMemcpy takes (dst, src, count, kind), so the device pointer
    // comes first for a host-to-device transfer.
    if (err == cudaSuccess) err = cudaMemcpy(d_nn, h_nn, nnBytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        Test<<<1, 32>>>(d_nn, d_nn16);
        err = cudaGetLastError();              // launch-configuration errors
    }
    if (err == cudaSuccess) err = cudaDeviceSynchronize();  // errors raised while the kernel ran

    // cudaFree(nullptr) and free() are safe on every path above.
    cudaFree(d_nn16);
    cudaFree(d_nn);
    free(h_nn);
    return err == cudaSuccess ? 0 : 1;
}

error:

Forum_copy.cu(6): error: an asm operand must be an integral constant expression.

Thanks for the help.

Is there some reason you don’t want to use the CUDA C++ capability?

To address your question directly, I would point out several things. You may wish to refer to the inline assembly guide for details and examples.

  • as the error indicates, the last parameter must be a compile-time constant, so add const for example
  • operand numbering starts with zero
  • you are passing address operands incorrectly. They should look like a pointer, not a value
  • address operands should not be passed with the f constraint (a floating point register) but instead the l constraint (a 64-bit register).

The following addresses the above issues and compiles cleanly for me. I haven’t tested it:

$ cat t134.cu
#include <stdint.h>

// Each thread asynchronously copies one 16-byte chunk (4 floats) from global
// memory into its own row of a shared array using cp.async, then waits for the
// copy to finish. Requires sm_80 or newer.
//
//   nn   : global source buffer; thread t reads nn[4*t .. 4*t+3]. The base
//          pointer must be 16-byte aligned (cudaMalloc allocations are).
//   nn16 : unused in this test kernel.
//
// Launch with at most 32 threads per block (one row of s_copy per thread).
__global__ void Test(float *nn, uint32_t *nn16){
    // A 16-byte cp.async needs 16-byte-aligned source and destination addresses.
    __shared__ __align__(16) float s_copy[32][4];
    const int SizeInBytes = 16;
    // cp.async.*.shared.global takes its destination as an address in the
    // shared state space, not as a generic pointer, so convert it first.
    const uint32_t dst = static_cast<uint32_t>(__cvta_generic_to_shared(s_copy[threadIdx.x]));
    // threadIdx.x * 4 floats keeps every thread's source 16-byte aligned
    // (threadIdx.x + 4 would be misaligned for three out of four threads).
    const float *src = nn + threadIdx.x * 4;
    asm volatile("cp.async.ca.shared.global [%0], [%1], %2;\n" :: "r"(dst), "l"(src), "n"(SizeInBytes) : "memory");
    // Wait for this thread's outstanding cp.async operations.
    asm volatile("cp.async.wait_all;\n" ::: "memory");
}

// Host driver: fills a host buffer with alternating +0.11 / -0.11, uploads it
// to the device and launches Test with one block of 32 threads.
// Returns 0 on success and 1 if the host allocation or any CUDA call fails.
int main() {
    // All sizes are in BYTES. The element count is derived from the byte size
    // so that the fill loop stays inside the host allocation.
    const size_t nnBytes   = 393216;
    const size_t nn16Bytes = 196608;
    const size_t nnCount   = nnBytes / sizeof(float);

    float* h_nn = (float*)malloc(nnBytes);
    if (!h_nn) return 1;
    for (size_t i = 0; i < nnCount; i = i + 2){
        h_nn[i] = 0.11f;
        h_nn[i+1] = -0.11f;
    }

    // malloc cuda
    float* d_nn = nullptr;
    uint32_t* d_nn16 = nullptr;
    cudaError_t err = cudaMalloc(&d_nn, nnBytes);
    if (err == cudaSuccess) err = cudaMalloc(&d_nn16, nn16Bytes);

    // copy: cudaMemcpy takes (dst, src, count, kind), so the device pointer
    // comes first for a host-to-device transfer.
    if (err == cudaSuccess) err = cudaMemcpy(d_nn, h_nn, nnBytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        Test<<<1, 32>>>(d_nn, d_nn16);
        err = cudaGetLastError();              // launch-configuration errors
    }
    if (err == cudaSuccess) err = cudaDeviceSynchronize();  // errors raised while the kernel ran

    // cudaFree(nullptr) and free() are safe on every path above.
    cudaFree(d_nn16);
    cudaFree(d_nn);
    free(h_nn);
    return err == cudaSuccess ? 0 : 1;
}
$ nvcc -arch=sm_80 -o t134 t134.cu
$

For reference, the instruction in question is documented here.

To repeat, I haven’t tested it, and your intent is not entirely clear. For example you wrote: nn[threadIdx.x + 4] but you may have meant nn[threadIdx.x * 4]. There may be other issues as well.

1 Like

Thanks for the help, Robert_Crovella.

I have one last problem: it looks like my kernel stops executing after the cp.async instruction.

code :

#include <stdint.h>
#include <stdio.h>
#include <iostream>

// Second attempt from the question: every thread issues one 16-byte cp.async
// from nn + threadIdx.x * 4 into its row of the shared array s_copy and waits
// for it; thread 0 then copies 4 bytes (one float) of s_copy[0] to out.
// nn16 is not used.
// NOTE(review): this listing is kept exactly as posted; the compute-sanitizer
// log quoted further down ("Invalid __shared__ write of size 16 bytes") was
// produced by this kernel.
__global__ void Test(float *nn, uint32_t *nn16, float *out){
    __shared__ float s_copy[32][4];
    const int SizeInBytes = 16;
    // NOTE(review): s_copy[threadIdx.x] is a generic pointer passed through an
    // "l" (64-bit) operand, whereas cp.async.*.shared.global expects a
    // shared-state-space address as its destination. This is the likely cause
    // of the out-of-bounds shared writes in the log; converting the pointer with
    // __cvta_generic_to_shared() first is the usual remedy. TODO confirm.
    asm volatile("cp.async.ca.shared.global [%0], [%1], %2;\n" :: "l"(s_copy[threadIdx.x]), "l"(nn+threadIdx.x * 4), "n"(SizeInBytes));
    // Waits for the cp.async operations previously issued by this thread.
    asm volatile("cp.async.wait_all;\n" ::);
    // Only thread 0 writes the result: 4 bytes, i.e. the first float of row 0.
    if (threadIdx.x == 0)
      memcpy(out, s_copy[0], 4);
}

// Host driver for the second attempt: fills a host buffer with 0.11, launches
// Test with one block of 32 threads, copies 4 floats back and prints the first.
// NOTE(review): this listing is kept exactly as posted; no CUDA return code is
// checked, so failures only show up under compute-sanitizer.
int main() {
    float* h_C = (float*)malloc(4*sizeof(float));
    // Host input buffer: 393216 floats, all set to 0.11.
    float* h_nn = (float*)malloc(393216*sizeof(float));
    for (int i = 0; i < 393216; i = i+1){
        h_nn[i] = 0.11;
    }

    // Device allocations. Sizes here are in bytes: d_nn holds 393216 bytes,
    // not 393216 floats.
    float* d_nn;
    cudaMalloc(&d_nn, 393216);
    uint32_t* d_nn16;
    cudaMalloc(&d_nn16, 196608);

    // Device output buffer: 4 floats.
    float* d_C;
    cudaMalloc(&d_C, 4*sizeof(float));
    // NOTE(review): cudaMemcpy takes (dst, src, count, kind). With
    // cudaMemcpyHostToDevice the device pointer must come first; here the host
    // pointer is the destination and the device pointer the source, which
    // matches the "invalid argument" cudaMemcpy error in the sanitizer log.
    // d_nn is therefore never filled with the host data.
    cudaMemcpy(h_nn, d_nn, 393216, cudaMemcpyHostToDevice);

    Test<<<1, 32>>>(d_nn, d_nn16, d_C);
    cudaDeviceSynchronize();
    // Copy the 4 output floats back and print the first one.
    cudaMemcpy(h_C, d_C, 4*sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << h_C[0];
}

The returned value is 0, not 0.11.

Also, if I put a printf above the cp.async.ca.shared.global statement, it is displayed, but a printf placed below that statement is not displayed.

From reading the documentation, I do not understand where my error comes from. Thanks.

what happens when you run your code with compute-sanitizer? Are any errors reported?

Excuse me, I don’t know how to use compute-sanitizer. Is there a tutorial? I can’t work out how to get started from the documentation.

thank

here is the documentation. As indicated there, the most basic usage is:

compute-sanitizer ./my_executable

when you run your code (change my_executable to the name of your compiled code/application) that way, what is the output?

I think I have an installation problem, because I get:
compute-sanitizer: command not found

nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105

I will redo my installation and then run the command.
Thanks.

You should see something like this on a proper CUDA 11.1 linux install:

$ /usr/local/cuda-11.1/bin/compute-sanitizer --help
NVIDIA (R) Compute Sanitizer
Copyright (c) 2020-2020 NVIDIA Corporation
Version 2020.2.1
Usage: compute-sanitizer [options] [your-program] [your-program-options]

General options:
  -h [ --help ]                         Produce this help message.
  -v [ --version ]                      Print the version number.
  --binary-patching arg (=yes)          Control the binary patching of the device code.
                                        Disabling this option will result in a loss of precision for error reporting.
  --check-api-memory-access arg (=yes)  Check cudaMemcpy/cudaMemset for accesses to device memory
  --check-device-heap arg (=yes)        Check allocation on the device heap.
  --check-exit-code arg (=yes)          Check application exit code and print an error if it is different than 0
  --demangle arg (=full)                Demangle function names.
                                        full   : Show demangled function name, including parameters, templates, etc
                                        simple : Show demangled function name, without parameters, templates, etc
                                        no     : Show mangled function name
  --destroy-on-device-error arg (=context)
                                        Behavior of compute-sanitizer on a precise device error.
                                        NOTE: Imprecise errors will always destroy the context.
                                        context : CUDA context is terminated with an error.
                                        kernel  : Kernel is terminated. Subsequent kernel launches are still allowed.
  --error-exitcode arg                  When this is set, compute-sanitizer will return the given exitcode when any errors are detected.
  --exclude arg                         key1=val1,key2=val2,...
                                        The exclude option can be used to control the kernels that will not be checked by the tool
                                        Multiple exclude options can be defined. Each option is additive, so kernels matching
                                        any specified pattern will be excluded
                                        If this option is specified with filter option(s), only kernels matching at least one
                                        filter pattern and not matching any exclude patterns will be checked.
                                        Exclusions are specified as key value pairs, with each pair separated by a ','
                                        Keys have both a long form, and a shorter form for convenience.
                                        Valid values for keys are:
                                            kernel_name, kne      : The value is the full mangled name of the kernel
                                            kernel_substring, kns : The value is a substring in the mangled name of the kernel
  --filter arg                          key1=val1,key2=val2,...
                                        The filter option can be used to control the kernels that will be checked by the tool
                                        Multiple filter options can be defined. Each option is additive, so kernels matching
                                        any specified filter will be checked
                                        Filters are specified as key value pairs, with each pair separated by a ','
                                        Keys have both a long form, and a shorter form for convenience.
                                        Valid values for keys are:
                                            kernel_name, kne      : The value is the full mangled name of the kernel
                                            kernel_substring, kns : The value is a substring in the mangled name of the kernel
  --force-blocking-launches arg (=no)   Force launches to be blocking.
  --force-synchronization-limit arg (=0)
                                        Set the maximum number of launches occurring on a given stream without forcing a synchronization.
                                        A lower value can help reducing tools usage of device memory at the cost of performances.
                                        A zero value sets no limit. A one value is equivalent to --force-blocking-launches.
  --injection-path arg                  Set the path to injection libraries.
  --language arg (=c)                   <c|fortran> This option can be used to enable language specific behavior. When set to fortran, the thread and block indices
                                        of messages printed by compute-sanitizer will start with 1-based offset to match Fortran semantics.
  --launch-timeout arg (=10)            Timeout in seconds to wait to connect to the target process. A value of zero forces compute-sanitizer to wait infinitely.
  --log-file arg                        File where compute-sanitizer will write all of its text output. If not specified, compute-sanitizer output is written to stdout.
                                        The sequence %p in the string name will be replaced by the pid of the compute-sanitizer application.
                                        The sequence %q{FOO} will be replaced by the value of the environment variable FOO. If the environment variable
                                        is not defined, it will be replaced by an empty string
                                        The sequence %% is replaced with a literal % in the file name.
                                        Any other character following % will cause an error.
  --max-connections arg (=10)           Maximum number of ports for connecting to target application
  --kill arg (=yes)                     Makes the compute-sanitizer kill the target application when having a communication error.
  --port arg (=49152)                   Base port for connecting to target application
  --prefix arg (==========)             Changes the prefix string displayed by compute-sanitizer.
  --print-level arg (=warn)             <info|warn|error|fatal> Set the minimum level of errors to print.
  --print-limit arg (=10000)            When this is set, compute-sanitizer will stop printing errors after reaching the given number of errors.
                                        Use 0 for unlimited printing.
  --read arg                            Reads error records from a given file, previously generated with --save.
  --save arg                            Saves the error record to file.
                                        The sequence %p in the string name will be replaced by the pid of the compute-sanitizer application.
                                        The sequence %q{FOO} will be replaced by the value of the environment variable FOO. If the environment variable
                                        is not defined, it will be replaced by an empty string.
                                        The sequence %% is replaced by a literal % in the file name.
                                        Any other character following % will cause an error.
  --show-backtrace arg (=yes)           Display a backtrace on error.
                                        no     : No backtrace shown
                                        host   : Only host backtrace shown
                                        device : Only device backtrace shown for precise errors
                                        yes    : Host and device backtraces shown
                                        See the manual for more information
  --target-processes arg (=application-only)
                                        Select which processes are to be tracked by compute-sanitizer:
                                        application-only : Track only the root application process
                                        all              : Track the root application and all its child processes
  --tool arg (=memcheck)                Set the tool to use.
                                        memcheck  : Memory access checking
                                        racecheck : Shared memory hazard checking
                                        synccheck : Synchronization checking
                                        initcheck : Global memory initialization checking

Memcheck-specific options:
  --report-api-errors arg (=explicit)   Print errors if any API call fails.
                                        all      : Report all CUDA API errors, including APIs invoked implicitly
                                        explicit : Report errors in explicit CUDA API calls only
                                        no       : Disable reporting of CUDA API errors
  --leak-check arg (=no)                <full|no> Print leak information for CUDA allocations.
                                        NOTE: Program must end with cudaDeviceReset() for this to work.

Racecheck-specific options:
  --racecheck-num-hazards arg (=32768)  Maximum number of racecheck hazards to record per-launch for a single SM.
  --racecheck-report arg (=analysis)    The reporting mode that applies to racecheck.
                                        all      : Report all hazards and race analysis reports.
                                        hazard   : Report only hazards.
                                        analysis : Report only race analysis reports.

Initcheck-specific options:
  --track-unused-memory arg (=no)       Check for unused memory allocations.

Please see the compute-sanitizer manual for more information.

$

Sorry for the delay. I had installed CUDA through Lambda Stack; I have finally managed to run compute-sanitizer.

error:

========= COMPUTE-SANITIZER
========= Program hit invalid argument (error 1) on CUDA API call to cudaMemcpy.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x355b43]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0x5b77d]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7c77]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x3e000000 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (1,0,0) in block (0,0,0)
=========     Address 0x3e000010 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (2,0,0) in block (0,0,0)
=========     Address 0x3e000020 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (3,0,0) in block (0,0,0)
=========     Address 0x3e000030 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (4,0,0) in block (0,0,0)
=========     Address 0x3e000040 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (5,0,0) in block (0,0,0)
=========     Address 0x3e000050 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (6,0,0) in block (0,0,0)
=========     Address 0x3e000060 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (7,0,0) in block (0,0,0)
=========     Address 0x3e000070 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (8,0,0) in block (0,0,0)
=========     Address 0x3e000080 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (9,0,0) in block (0,0,0)
=========     Address 0x3e000090 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (10,0,0) in block (0,0,0)
=========     Address 0x3e0000a0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (11,0,0) in block (0,0,0)
=========     Address 0x3e0000b0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (12,0,0) in block (0,0,0)
=========     Address 0x3e0000c0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (13,0,0) in block (0,0,0)
=========     Address 0x3e0000d0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (14,0,0) in block (0,0,0)
=========     Address 0x3e0000e0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (15,0,0) in block (0,0,0)
=========     Address 0x3e0000f0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (16,0,0) in block (0,0,0)
=========     Address 0x3e000100 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (17,0,0) in block (0,0,0)
=========     Address 0x3e000110 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (18,0,0) in block (0,0,0)
=========     Address 0x3e000120 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (19,0,0) in block (0,0,0)
=========     Address 0x3e000130 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (20,0,0) in block (0,0,0)
=========     Address 0x3e000140 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (21,0,0) in block (0,0,0)
=========     Address 0x3e000150 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (22,0,0) in block (0,0,0)
=========     Address 0x3e000160 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (23,0,0) in block (0,0,0)
=========     Address 0x3e000170 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (24,0,0) in block (0,0,0)
=========     Address 0x3e000180 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (25,0,0) in block (0,0,0)
=========     Address 0x3e000190 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (26,0,0) in block (0,0,0)
=========     Address 0x3e0001a0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (27,0,0) in block (0,0,0)
=========     Address 0x3e0001b0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (28,0,0) in block (0,0,0)
=========     Address 0x3e0001c0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (29,0,0) in block (0,0,0)
=========     Address 0x3e0001d0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (30,0,0) in block (0,0,0)
=========     Address 0x3e0001e0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Invalid __shared__ write of size 16 bytes
=========     at 0x90 in Test(float*, unsigned int*, float*)
=========     by thread (31,0,0) in block (0,0,0)
=========     Address 0x3e0001f0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x25428a]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0xc74b]
=========                in /home/venus/./a.out
=========     Host Frame: [0x5fe70]
=========                in /home/venus/./a.out
=========     Host Frame: [0x8090]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7ee3]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7f3a]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf0]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Program hit unspecified launch failure (error 719) on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x355b43]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0x3fa47]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7cf5]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
========= Program hit unspecified launch failure (error 719) on CUDA API call to cudaMemcpy.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x355b43]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0x5b77d]
=========                in /home/venus/./a.out
=========     Host Frame: [0x7d0f]
=========                in /home/venus/./a.out
=========     Host Frame:__libc_start_main [0x270b3]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame: [0x7ace]
=========                in /home/venus/./a.out
========= 
30.6386========= ERROR SUMMARY: 35 errors

Thanks.

Well, that is why your code is not doing what you thought it should.

Thanks, Robert_Crovella.

Unfortunately, I don’t really understand the error.

So why is the shared memory pointer out of bounds? How do I correctly pass the shared memory pointer to the asm?

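See here. In short: the destination operand of `cp.async` must be an address in the shared state space, not a generic pointer, so convert it first (for example with `__cvta_generic_to_shared()`) before passing it to the asm statement.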