DOCA DPA Threads not waking up

Hello, I’m trying to start DPA threads using a completion context, but they never wake up. Below is a simplified version of my code. I set up a thread and a completion context, attach the completion context to the thread, and attach the async ops object to the completion context. I also attach a notification completion to the thread and send a notification to it, but the thread never runs. Am I doing something wrong? Could someone please take a quick look at my steps and tell me whether I am missing something? Thank you.

Host side (error checking omitted here but passes successfully in my code):

/*
 * Argument block shared between the host and the DPA code.
 *
 * The host fills one instance, copies it into DPA memory with
 * doca_dpa_h2d_memcpy() using sizeof(struct thread_arg), registers that DPA
 * buffer as the thread argument, and also passes the struct by value to
 * trigger_async_memcpy_rpc().
 * NOTE(review): because it is copied as raw bytes, the device-side definition
 * presumably has to have the identical field order and sizes - confirm.
 */
struct thread_arg {
	/* Device handle of the async-ops object; first argument of doca_dpa_dev_post_memcpy(). */
	doca_dpa_dev_async_ops_t async_ops_handle;
	/* Device handle of the DPA completion context the async-ops object is attached to. */
	doca_dpa_dev_completion_t dpa_comp_handle;
	/* Device mmap handle passed as the copy source mmap. */
	doca_dpa_dev_mmap_t src_mmap_handle;
	/* Device mmap handle passed as the copy destination mmap. */
	doca_dpa_dev_mmap_t dst_mmap_handle;
	/* Base source address; the RPC adds the running byte offset to it. */
	doca_dpa_dev_uintptr_t src_addr;
	/* Base destination address; the RPC adds the running byte offset to it. */
	doca_dpa_dev_uintptr_t dst_addr;
	/* Total number of bytes to copy. */
	size_t total_size;
	/* Upper bound on the bytes posted per memcpy operation. */
	size_t chunk_size;
	/* Set from DPA_runtime on the host; not read by the device code shown here. */
	doca_dpa_dev_uintptr_t runtime_ptr;
	/* Device handle of the completion sync event; not read by the device code shown here. */
	doca_dpa_dev_sync_event_t comp_sync_event;
	/* Value paired with comp_sync_event (host sets it to comp_event_val); usage not shown here. */
	uint64_t comp_sync_event_val;
};

/* Host-side DOCA objects, each filled in by its matching create call below. */
struct doca_dpa_thread *dpa_thread = NULL;
struct doca_dpa_notification_completion *notify_comp = NULL;
struct doca_dpa_completion *dpa_comp = NULL;
struct doca_dpa_async_ops *async_ops = NULL;
struct doca_sync_event *comp_event = NULL;

/* Device-side handles for the objects above, handed to the DPA code. */
doca_dpa_dev_notification_completion_t notify_comp_handle;
doca_dpa_dev_completion_t dpa_comp_handle;
doca_dpa_dev_async_ops_t async_ops_handle;
doca_dpa_dev_sync_event_t comp_event_handle;
doca_dpa_dev_mmap_t src_mmap_handle;
doca_dpa_dev_mmap_t dst_mmap_handle;

/* Value stored in thread_arg.comp_sync_event_val. */
uint64_t comp_event_val = 10;

/* Host copy of the thread argument and the DPA buffer that receives it. */
struct thread_arg targ = {0};
doca_dpa_dev_uintptr_t thread_arg_dev_ptr = 0;

/*
 * Host-side setup and launch sequence (return values are not checked in this
 * excerpt).
 *
 * thread_arg_dev_ptr is declared once, together with the other locals;
 * declaring it a second time here is a redefinition error, so the duplicate
 * declaration has been dropped.
 *
 * NOTE(review): notify_thread, rpc_result, resources, DPA_arr, DPA_arr2,
 * DPA_runtime, arr_volume and CHUNK_SIZE are not declared in this excerpt, and
 * src_mmap_handle / dst_mmap_handle are never assigned here - confirm they are
 * set up elsewhere before this code runs.
 */

/* 1. DPA thread: create it, bind the kernel and its DPA-memory argument, start it. */
doca_dpa_thread_create(resources->pf_dpa_ctx, &dpa_thread);
doca_dpa_mem_alloc(resources->pf_dpa_ctx, sizeof(struct thread_arg), &thread_arg_dev_ptr);
doca_dpa_thread_set_func_arg(dpa_thread, &async_memcpy_thread, thread_arg_dev_ptr);
doca_dpa_thread_start(dpa_thread);

/* 2. Notification completion attached to the thread; its handle is given to wake_thread_rpc. */
doca_dpa_notification_completion_create(resources->pf_dpa_ctx, dpa_thread, &notify_comp);
doca_dpa_notification_completion_start(notify_comp);
doca_dpa_notification_completion_get_dpa_handle(notify_comp, &notify_comp_handle);
doca_dpa_notification_completion_get_thread(notify_comp, &notify_thread);

/* 3. Completion context attached to the same thread, with the async-ops object attached to it. */
doca_dpa_completion_create(resources->pf_dpa_ctx, 8, &dpa_comp);
doca_dpa_completion_set_thread(dpa_comp, dpa_thread);
doca_dpa_async_ops_create(resources->pf_dpa_ctx, 16, 0, &async_ops);
doca_dpa_async_ops_attach(async_ops, dpa_comp);
doca_dpa_completion_start(dpa_comp);
doca_dpa_async_ops_get_dpa_handle(async_ops, &async_ops_handle);
doca_dpa_completion_get_dpa_handle(dpa_comp, &dpa_comp_handle);

/* 4. Sync event whose device handle is passed to the DPA side in the argument struct. */
create_doca_dpa_completion_sync_event(resources->pf_dpa_ctx,
				      resources->pf_doca_device,
				      &comp_event,
				      &comp_event_handle);

/* 5. Thread and both completion objects are started; move the thread to the run state. */
doca_dpa_thread_run(dpa_thread);

/* 6. Fill the argument on the host and copy it into the DPA buffer bound to the thread. */
targ.async_ops_handle = async_ops_handle;
targ.dpa_comp_handle = dpa_comp_handle;
targ.src_mmap_handle = src_mmap_handle;
targ.dst_mmap_handle = dst_mmap_handle;
targ.src_addr = DPA_arr;
targ.dst_addr = DPA_arr2;
targ.total_size = arr_volume;
targ.chunk_size = CHUNK_SIZE;
targ.runtime_ptr = DPA_runtime;
targ.comp_sync_event = comp_event_handle;
targ.comp_sync_event_val = comp_event_val;
doca_dpa_h2d_memcpy(resources->pf_dpa_ctx, thread_arg_dev_ptr, &targ, sizeof(targ));

/* 7. Notify the thread from the device, then post the chunked memcpy operations. */
doca_dpa_rpc(resources->pf_dpa_ctx, &wake_thread_rpc, &rpc_result, notify_comp_handle);
doca_dpa_rpc(resources->pf_dpa_ctx, &trigger_async_memcpy_rpc, &rpc_result, targ);

Device side:

/**
 * RPC entry point that notifies the DPA thread behind a notification
 * completion.
 *
 * Emits one log line before and one after calling
 * doca_dpa_dev_thread_notify().
 *
 * @param notify_handle  Device handle of the notification completion.
 * @return Always 0.
 */
__dpa_rpc__ uint64_t wake_thread_rpc(doca_dpa_dev_notification_completion_t notify_handle)
{
	const uint64_t rpc_status = 0;

	DOCA_DPA_DEV_LOG_INFO("[RPC-WAKE]: Waking thread via notification...\n");
	doca_dpa_dev_thread_notify(notify_handle);
	DOCA_DPA_DEV_LOG_INFO("[RPC-WAKE]: Got here\n");

	return rpc_status;
}

/**
 * RPC entry point that posts a chunked memcpy through the async-ops object.
 *
 * Copies targ.total_size bytes from (src_mmap_handle, src_addr) to
 * (dst_mmap_handle, dst_addr) in pieces of at most targ.chunk_size bytes.
 * Every chunk except the last is posted with
 * DOCA_DPA_DEV_SUBMIT_FLAG_OPTIMIZE_REPORTS; the last chunk is posted with
 * DOCA_DPA_DEV_SUBMIT_FLAG_FLUSH.
 *
 * NOTE(review): the host excerpt declares the argument type as
 * `struct thread_arg`; a matching `thread_arg_t` typedef must be visible to
 * the device code - confirm.
 *
 * @param targ  Handles, addresses and sizes describing the copy.
 * @return 0 once all chunks are posted (also when total_size is 0);
 *         1 if chunk_size is 0 while total_size is non-zero.
 */
__dpa_rpc__ uint64_t trigger_async_memcpy_rpc(thread_arg_t targ)
{
	size_t bytes_copied = 0;

	/*
	 * With a zero chunk size every iteration would post a zero-length copy
	 * and never advance bytes_copied, so the RPC would never return.
	 */
	if (targ.total_size != 0 && targ.chunk_size == 0) {
		return 1;
	}

	while (bytes_copied < targ.total_size) {
		size_t remaining = targ.total_size - bytes_copied;
		size_t copy_size = (remaining >= targ.chunk_size) ? targ.chunk_size : remaining;

		/* copy_size == remaining exactly when this chunk reaches total_size. */
		uint32_t flags = (copy_size == remaining)
			? DOCA_DPA_DEV_SUBMIT_FLAG_FLUSH
			: DOCA_DPA_DEV_SUBMIT_FLAG_OPTIMIZE_REPORTS;

		doca_dpa_dev_post_memcpy(
			targ.async_ops_handle,
			targ.dst_mmap_handle,
			targ.dst_addr + bytes_copied,
			targ.src_mmap_handle,
			targ.src_addr + bytes_copied,
			copy_size,
			flags
		);

		bytes_copied += copy_size;
	}

	return 0;
}

/**
 * DPA thread kernel bound to the thread via doca_dpa_thread_set_func_arg().
 *
 * Logs one line and then yields with doca_dpa_dev_thread_reschedule() so the
 * thread can be triggered again; the original body returned without calling
 * either thread-exit function. Use doca_dpa_dev_thread_finish() instead if
 * the thread is meant to run only once.
 *
 * @param arg  Thread argument registered on the host (the DPA address of the
 *             thread_arg buffer in the host excerpt); unused here.
 */
__dpa_global__ void async_memcpy_thread(uint64_t arg) {
	(void)arg;

	DOCA_DPA_DEV_LOG_INFO("This never prints\n");

	doca_dpa_dev_thread_reschedule();
}

Hi Denischen242

You’re basically trying to do the same thing that the official DPA “Hello World” example does: start a DPA thread, attach a notification completion to it, and then wake it from an RPC.

The sequence in the docs looks like this (Hello World Example & Notification Completion section):

  • Create and start the DPA context, and set the DPA app
    (doca_dpa_create(), doca_dpa_set_app(), doca_dpa_start()).

  • Create the DPA thread and set its kernel + argument
    (doca_dpa_thread_create(), doca_dpa_thread_set_func_arg(), doca_dpa_thread_start()).

  • Create a DPA notification completion attached to that thread and start it
    (doca_dpa_notification_completion_create(dpa_ctx, dpa_thread, &notify_comp),
    doca_dpa_notification_completion_start(notify_comp),
    doca_dpa_notification_completion_get_dpa_handle(notify_comp, &notify_comp_handle)).

  • Only after both the thread and the notification completion are started, call
    doca_dpa_thread_run(dpa_thread).

  • From the device side RPC, call
    doca_dpa_dev_thread_notify(notify_comp_handle)
    – that’s what actually schedules the thread. The example kernel then logs a message and calls doca_dpa_dev_thread_finish() when done.

    The full step‑by‑step flow (host code and device code) is given in the Hello World example of the DOCA DPA documentation.

Given your snippet, I would suggest:

  • Compare your order of calls 1:1 against that example (especially: doca_dpa_set_app / doca_dpa_start, doca_dpa_thread_start, notification completion *_start, and then doca_dpa_thread_run last).

  • Make sure the thread kernel you want to run is the one actually registered in the DPA app (as in the example, via doca_dpa_func_t), and that your RPC uses doca_dpa_dev_thread_notify() on the notification completion handle you got from the host.

Once your flow matches the Hello World example and you see the device‑side log from the thread, you can then layer your async‑ops / completion‑context logic on top.

Thanks

xyin