Parallel processing

I am simulating a scene with a large number of spheres, aiming to move each sphere to a target coordinate.
When the number of objects reaches 1000, the physics calculations drop to 19 steps/sec.
Can this performance be improved by utilizing parallel processing?
I aim to eventually simulate control over 10,000 or more objects.
Below is the code I would like to improve.

from omni.isaac.examples.base_sample import BaseSample
from omni.isaac.dynamic_control import _dynamic_control
from omni.isaac.core.utils.stage import add_reference_to_stage
import numpy as np
import random
import time

class HelloWorld(BaseSample):
    def __init__(self) -> None:
        super().__init__()
        self.goal_range = (-0.5, 0.5)  # 目的地の範囲
        self.update_frequency = 5 # 更新頻度の設定
        self.current_frame = 0  # 現在のフレーム数を追跡
        self.callback_count = 0  # コールバック実行回数
        self.start_time = time.time()  # 開始時刻

    def setup_scene(self):
        world = self.get_world()
        world.scene.add_default_ground_plane()
        self.file_path = "/home/member/Desktop/USD/scene/1000_100_15^2_re.usd"
        add_reference_to_stage(usd_path=self.file_path, prim_path="/World/objects")

        self.goal_positions = [np.array([random.uniform(*self.goal_range), random.uniform(*self.goal_range), 0]) for _ in range(1000)]

    async def setup_post_load(self):
        self._world = self.get_world()
        self._dc = _dynamic_control.acquire_dynamic_control_interface()
        self.pedestrian_handles = [self._dc.get_rigid_body(f"/World/objects/Pedestrian_{i}") for i in range(1000)]
        self._world.add_physics_callback("send_actions", self.send_actions)

    def send_actions(self, step_size):
        self.callback_count += 1
        current_time = time.time()
        if current_time - self.start_time >= 1.0:  # 1秒経過ごとに実行回数をログ出力
            print(f"callback count: {self.callback_count} steps/sec")
            self.callback_count = 0
            self.start_time = current_time
        
        # 目的地制御
        if self.current_frame % self.update_frequency == 0:
            for i, pedestrian_handle in enumerate(self.pedestrian_handles):
                current_position = self._dc.get_rigid_body_pose(pedestrian_handle).p
                goal_pos = self.goal_positions[i]
                direction = goal_pos - current_position
                distance = np.linalg.norm(direction)
                if distance > 0.1:
                    direction = direction / distance
                    velocity = direction * 1.0
                    self._dc.set_rigid_body_linear_velocity(pedestrian_handle, velocity)
                else:
                    self._dc.set_rigid_body_linear_velocity(pedestrian_handle, np.array([0, 0, 0]))
        self.current_frame += 1

When I tried parallel processing, the steps/sec decreased (though it might be due to my code). I have also tried other methods to reduce computational load, such as grouping objects and applying forces at specific frames.

I optimized the computational part and modified the code as follows. However, the part “current_positions = np.array([self._dc.get_rigid_body_pose(handle).p for handle in self.pedestrian_handles])” has become a bottleneck, slowing down the process.
Does this part support batch processing or multithreading?

    def send_actions(self, step_size):
        self._dc.set_rigid_body_disable_simulation(self.pedestrian_orijinal, True)
        self._dc.set_rigid_body_disable_simulation(self.obstacle_orijinal, True)
        
        self.callback_count += 1
        current_time = time.time()
        if current_time - self.start_time >= 1.0:  # 1秒経過ごとに実行回数をログ出力
            print(f"callback count: {self.callback_count} steps/sec")
            self.callback_count = 0
            self.start_time = current_time
        
        #目的地制御(ベクトル演算)
        if self.current_frame % self.update_frequency == 0:
            current_positions = np.array([self._dc.get_rigid_body_pose(handle).p for handle in self.pedestrian_handles])
            goal_positions = np.array(self.goal_positions)
            
            directions = goal_positions - current_positions # 目的地までの方向ベクトルを計算
            norms = np.linalg.norm(directions, axis=1, keepdims=True) # 各方向ベクトルの大きさを計算
            normalized_directions = np.where(norms > 0, directions / norms, 0) # 方向ベクトルを正規化
            velocities = normalized_directions * 1.0  # 速度ベクトルを設定
        
            # 目的地に十分近づいたか判断するための距離の二乗
            squared_distances = np.sum(directions**2, axis=1)
            threshold_squared = 0.1**2
        
            # 各オブジェクトに対して速度を設定
            for i, pedestrian_handle in enumerate(self.pedestrian_handles):
                if squared_distances[i] > threshold_squared:
                    self._dc.set_rigid_body_linear_velocity(pedestrian_handle, velocities[i])
                else:
                    self._dc.set_rigid_body_linear_velocity(pedestrian_handle, np.array([0, 0, 0]))

        self.current_frame += 1