From 8d298667bc1c16753cdd1b1d06f0fdc809469be3 Mon Sep 17 00:00:00 2001 From: sufubao Date: Thu, 2 Apr 2026 12:56:41 +0000 Subject: [PATCH 1/2] fix: prevent orphan VIT model processes on crash or interrupt VIT model inference processes spawned by the visual server had no mechanism to detect parent death and self-terminate. The process handles were also discarded immediately after spawning, making explicit cleanup impossible. - Add start_parent_check_thread() to VIT model worker processes so they monitor the visual server and self-terminate if it dies - Return process handles from start_model_process() and store them in VisualManager and VisualOnlyManager - Implement clean_up() to kill tracked VIT processes on shutdown --- lightllm/server/visualserver/manager.py | 11 +++++++++-- lightllm/server/visualserver/model_infer/__init__.py | 4 +++- lightllm/server/visualserver/visual_only_manager.py | 11 +++++++++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/lightllm/server/visualserver/manager.py b/lightllm/server/visualserver/manager.py index a165be78f2..a42fa98e1c 100644 --- a/lightllm/server/visualserver/manager.py +++ b/lightllm/server/visualserver/manager.py @@ -9,6 +9,7 @@ import setproctitle import threading import collections +import multiprocessing from typing import List from lightllm.server.core.objs.io_objs.group_req import GroupReqIndexes from lightllm.server.core.objs import ShmReqManager, StartArgs @@ -62,12 +63,14 @@ def __init__( async def wait_to_model_ready(self): self.model_rpcs: List[List[VisualModelRpcClient]] = [[] for _ in range(self.vit_dp)] + self.model_procs: List[multiprocessing.Process] = [] self.vit_attn_backend = init_vit_att_backend(index=0) for dp_rank_id in range(self.vit_dp): for tp_rank_id in range(self.vit_tp): - rpc_model = await start_model_process() + rpc_model, proc = await start_model_process() self.model_rpcs[dp_rank_id].append(rpc_model) + self.model_procs.append(proc) init_model_ret = [] for dp_rank_id in 
range(self.vit_dp): # async init model process @@ -187,7 +190,11 @@ async def loop_for_netio_req(self): logger.exception(str(e)) def clean_up(self): - return + for proc in getattr(self, "model_procs", []): + if proc.is_alive(): + logger.info(f"Killing VIT model process {proc.pid}") + proc.kill() + proc.join(timeout=5) def start_visual_process(args, pipe_writer): diff --git a/lightllm/server/visualserver/model_infer/__init__.py b/lightllm/server/visualserver/model_infer/__init__.py index ae3c4204db..db61407fd7 100644 --- a/lightllm/server/visualserver/model_infer/__init__.py +++ b/lightllm/server/visualserver/model_infer/__init__.py @@ -9,6 +9,7 @@ from rpyc.utils.classic import obtain from rpyc.utils.server import ThreadedServer from lightllm.utils.graceful_utils import graceful_registry +from lightllm.utils.process_check import start_parent_check_thread from lightllm.utils.envs_utils import get_env_start_args from .model_rpc_client import VisualModelRpcClient from .model_rpc import VisualModelRpcServer @@ -18,6 +19,7 @@ def _init_env(socket_path: str, success_event): # 注册graceful 退出的处理 graceful_registry(inspect.currentframe().f_code.co_name) + start_parent_check_thread() import lightllm.utils.rpyc_fix_utils as _ @@ -52,7 +54,7 @@ async def start_model_process(): # 服务端需要调用客户端传入的event所以,客户端需要一个后台线程进行相关的处理。 conn._bg_thread = rpyc.BgServingThread(conn, sleep_interval=0.001) - return VisualModelRpcClient(conn) + return VisualModelRpcClient(conn), proc def _generate_unix_socket_path() -> str: diff --git a/lightllm/server/visualserver/visual_only_manager.py b/lightllm/server/visualserver/visual_only_manager.py index 27275c1e8c..c90a80f86a 100644 --- a/lightllm/server/visualserver/visual_only_manager.py +++ b/lightllm/server/visualserver/visual_only_manager.py @@ -12,6 +12,7 @@ import os import signal import time +import multiprocessing from lightllm.utils.net_utils import get_hostname_ip from .objs import VIT_Obj from typing import List @@ -94,12 +95,14 @@ async def 
register_to_config_server_loop(self, args: StartArgs): async def wait_to_model_ready(self): self.model_rpcs: List[List[VisualModelRpcClient]] = [[] for _ in range(self.vit_dp)] + self.model_procs: List[multiprocessing.Process] = [] self.vit_attn_backend = init_vit_att_backend(index=0) for dp_rank_id in range(self.vit_dp): for tp_rank_id in range(self.vit_tp): - rpc_model = await start_model_process() + rpc_model, proc = await start_model_process() self.model_rpcs[dp_rank_id].append(rpc_model) + self.model_procs.append(proc) init_model_ret = [] for dp_rank_id in range(self.vit_dp): # async init model process @@ -130,7 +133,11 @@ async def infer_images(self, dp_index: int, images, events): await VisualManager.infer_images(self, dp_index=dp_index, images=images, events=events) def clean_up(self): - return + for proc in getattr(self, "model_procs", []): + if proc.is_alive(): + logger.info(f"Killing VIT model process {proc.pid}") + proc.kill() + proc.join(timeout=5) def exposed_remote_infer_images(self, images: List[ImageItem], ref_event: threading.Event): try: From 539e73330dcd8a1c0e1cc49d68dabc2a8fe66fa9 Mon Sep 17 00:00:00 2001 From: sufubao Date: Fri, 3 Apr 2026 05:20:33 +0000 Subject: [PATCH 2/2] fix: tolerate already-exited VIT processes in clean_up --- lightllm/server/visualserver/manager.py | 11 +++++++---- lightllm/server/visualserver/visual_only_manager.py | 11 +++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/lightllm/server/visualserver/manager.py b/lightllm/server/visualserver/manager.py index a42fa98e1c..d2a27fda62 100644 --- a/lightllm/server/visualserver/manager.py +++ b/lightllm/server/visualserver/manager.py @@ -191,10 +191,13 @@ async def loop_for_netio_req(self): def clean_up(self): for proc in getattr(self, "model_procs", []): - if proc.is_alive(): - logger.info(f"Killing VIT model process {proc.pid}") - proc.kill() - proc.join(timeout=5) + try: + if proc.is_alive(): + logger.info(f"Killing VIT model process {proc.pid}") + proc.kill() + proc.join(timeout=5) + except 
(ProcessLookupError, OSError): + pass def start_visual_process(args, pipe_writer): diff --git a/lightllm/server/visualserver/visual_only_manager.py b/lightllm/server/visualserver/visual_only_manager.py index c90a80f86a..5a8a9981a0 100644 --- a/lightllm/server/visualserver/visual_only_manager.py +++ b/lightllm/server/visualserver/visual_only_manager.py @@ -134,10 +134,13 @@ async def infer_images(self, dp_index: int, images, events): def clean_up(self): for proc in getattr(self, "model_procs", []): - if proc.is_alive(): - logger.info(f"Killing VIT model process {proc.pid}") - proc.kill() - proc.join(timeout=5) + try: + if proc.is_alive(): + logger.info(f"Killing VIT model process {proc.pid}") + proc.kill() + proc.join(timeout=5) + except (ProcessLookupError, OSError): + pass def exposed_remote_infer_images(self, images: List[ImageItem], ref_event: threading.Event): try: