Are CUDA API calls serialized at the driver level?

I have two GPUs. Thread t0 runs on GPU 0: it launches a kernel and calls cudaDeviceSynchronize to wait until the kernel has finished. Thread t1 runs on GPU 1: it just allocates unified memory and frees it.
I found that the cudaDeviceSynchronize call in thread t0 blocks the cudaFree call in thread t1.
Looking forward to any reply.

I don’t know why my sample code is blocked by the security rules (error code 14), so I have posted my sample code on GitHub.

Unified memory doesn’t really have a device location, (in contrast to memory allocated with cudaMalloc). It is migrated to the processor that needs it. So you should purge that kind of thought process.

The unified memory “allocated on device 1” could conceivably be referenced by the kernel running on device 0. Therefore it seems reasonable to me to not free the memory until any kernel that might be using it has finished.

Thanks for your reply. As you advised, I replaced the unified memory with cudaMalloc; the program now runs well and the cudaFree call in thread t1 succeeds.
As you said, “Unified memory doesn’t really have a device location”. This is my sample code.
If I call cudaMallocManaged before setting the device, the program is extremely slow; if I set the device before calling cudaMallocManaged, the program runs very fast.

#include <stdio.h>
#include <unistd.h>

#include <chrono>
#include <cstdint>
#include <iostream>
#include <thread>
#include <vector>

#include <cuda_runtime.h>

// Load-generating kernel: each thread selects slot (global thread index % len)
// of `ptr` and doubles the 64-bit value that slot points to, 128 * 128 times.
//
// Parameters:
//   ptr - array of `len` pointers, each pointing to one int64_t; both the
//         array and the pointees must be dereferenceable from the device.
//   len - number of entries in `ptr`; the kernel does nothing when len <= 0.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are read).
// No shared memory is used.
//
// NOTE: threads whose global indices are congruent modulo `len` update the
// same value with a plain (non-atomic) read-modify-write, so the final values
// are not meaningful when more than `len` threads are launched.
__global__ void proc(int64_t* ptr[],int len){
    if(len <= 0){
        return;  // `idx % len` below would otherwise divide by zero
    }
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    idx = idx % len;
    for(int i=0;i<128;++i){
        for(int j=0;j<128;++j){
            *ptr[idx] += *ptr[idx];
        }
    }
}

// Per-GPU worker: selects device `gpu_id`, allocates a managed array of 128
// pointers plus 128 managed int64_t values, runs the `proc` kernel once over
// them, prints how long launch + synchronize took, and frees everything.
//
// Parameters:
//   gpu_id - CUDA device ordinal passed to cudaSetDevice.
//
// Any failing CUDA call is reported on stdout with its numeric error code and
// the kernel run is skipped; memory allocated up to that point is still freed.
//
// NOTE(review): the launch configuration (<<<1, 128>>>, one thread per slot)
// is a reconstruction - the original launch line is not visible in this
// sample. Adjust it to match the intended benchmark.
void proc_thread1(int gpu_id){
    constexpr int kSlots = 128;  // number of int64_t slots handed to the kernel

    // The device is selected BEFORE the managed allocations; the author
    // reports this ordering as the fast one.
    auto err = cudaSetDevice(gpu_id); // very fast
    if(err != cudaSuccess){
        std::cout << "cudaSetDevice " << err << std::endl;
        return;
    }

    void *array_ptr = nullptr;
    err = cudaMallocManaged(&array_ptr,sizeof(void*)*kSlots);
    if(err != cudaSuccess){
        std::cout << "proc_thread0 " << err << std::endl;
        return;
    }
    int64_t** data_ptr = (int64_t**)(array_ptr);

    // `allocated` counts the slots that were successfully allocated so the
    // cleanup loop below frees exactly those.
    int allocated = 0;
    for(; allocated<kSlots; ++allocated){
        void* ptr=nullptr;
        err = cudaMallocManaged(&ptr,sizeof(int64_t));
        if(err != cudaSuccess){
            std::cout << "cudaMallocManaged " << err << std::endl;
            break;
        }
        data_ptr[allocated] = (int64_t*)ptr;
    }

    if(allocated == kSlots){
        // The author reports that calling cudaSetDevice here (after the
        // allocations) instead of at the top makes the run extremely slow -
        // presumably because the allocations were then made while the default
        // device was current; not verified here.
        //cudaSetDevice(gpu_id); // extremely slow
        auto start = std::chrono::high_resolution_clock::now();

        proc<<<1, kSlots>>>(data_ptr, kSlots);
        err = cudaGetLastError();            // launch-configuration errors
        if(err == cudaSuccess){
            err = cudaDeviceSynchronize();   // wait for the kernel; surfaces execution errors
        }

        auto stop = std::chrono::high_resolution_clock::now();
        if(err != cudaSuccess){
            std::cout << "proc " << err << std::endl;
        }
        auto span = (std::chrono::duration<double, std::milli>(stop - start)).count();
        std::cout << "gpu id = " << gpu_id;
        std::cout << ", cost :" << span << "(ms)" << std::endl;
    }

    // Release every managed allocation made above.
    for(int i=0; i<allocated; ++i){
        cudaFree(data_ptr[i]);
    }
    cudaFree(array_ptr);
}

// Entry point: runs proc_thread1 concurrently for device 0 and device 1, one
// host thread per device, and waits for both to finish.
int main(){

    std::thread th0(&proc_thread1,0);
    std::thread th1(&proc_thread1,1);

    // A std::thread that is still joinable when destroyed calls
    // std::terminate, so both workers must be joined before returning.
    th0.join();
    th1.join();
    return 0;
}

It seems that unified memory does have a device location.