diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..ff86f24 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.pt filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text diff --git a/data/COLLABORA_02_RGB.png b/data/COLLABORA_02_RGB.png new file mode 100644 index 0000000..46cd1f1 Binary files /dev/null and b/data/COLLABORA_02_RGB.png differ diff --git a/data/Chinedu-Obasi_2684938.jpg b/data/Chinedu-Obasi_2684938.jpg new file mode 100644 index 0000000..f60a259 Binary files /dev/null and b/data/Chinedu-Obasi_2684938.jpg differ diff --git a/demo/football/README.md b/demo/football/README.md new file mode 100644 index 0000000..7a33266 --- /dev/null +++ b/demo/football/README.md @@ -0,0 +1,41 @@ +# Football demo + +Real-time football broadcast overlay: **detection → tracking → overlay** +(`pyml_yolo`/`pyml_objectdetector` -> `pyml_tracker` -> `pyml_football_overlay`). + +The overlay draws a foot ellipse per player coloured by team (red/blue, voted +from jersey hue), a gold ellipse for referees, motion trails (off by default), +and a focal-player HUD with headshot, ball contacts, and distance travelled. +Players whose team isn't decided yet (and unclassifiable kits, e.g. the +goalkeeper) are left unmarked rather than drawn in a placeholder colour. The +ball is tracked for contact counting but its marker is off by default. + +## Run + +```bash +# file -> annotated MP4 +demo/football/run.sh +demo/football/run.sh 08fd33_4.mp4 demo/football/out.mp4 1280x720 + +# file -> live on-screen +demo/football/run.sh display +demo/football/run.sh display 08fd33_4.mp4 1280x720 + +# live camera -> on-screen +demo/football/run.sh camera /dev/video0 +``` + +## Environment knobs + +| Var | Default | Meaning | +|------------|---------|---------| +| `BACKEND` | `pt` | `pt` = PyTorch `pyml_yolo`; `fp16` = ONNX FP16 via `pyml_objectdetector` (CUDA). 
| +| `INTERVAL` | `3` | Run detection every Nth frame; the tracker/overlay still update every frame, so it stays smooth at ~N× less inference cost. The main real-time lever. | + +```bash +BACKEND=fp16 demo/football/run.sh display # faster inference path +INTERVAL=5 demo/football/run.sh display # detect every 5th frame +INTERVAL=1 demo/football/run.sh # detect every frame (max accuracy) +``` + + diff --git a/demo/football/onnx_loop.py b/demo/football/onnx_loop.py new file mode 100644 index 0000000..bbe9540 --- /dev/null +++ b/demo/football/onnx_loop.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +# Run a video through the ONNX (fp16) football pipeline. +# +# detector (onnx) -> pyml_tracker -> pyml_football_overlay +# +# Usage: +# python demo/football/onnx_loop.py INPUT.mp4 # live display, looping +# python demo/football/onnx_loop.py INPUT.mp4 OUTPUT.mp4 # write annotated mp4 +# (self-contained: finds the repo venv + plugins and re-execs into them) +import os +import sys +import glob + +REPO = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) +VENV = os.path.join(REPO, ".venv") +MODEL = os.path.join(REPO, "models/football/football_fp16.onnx") +os.environ["GST_PLUGIN_PATH"] = ( + os.path.join(REPO, "plugins") + os.pathsep + os.environ.get("GST_PLUGIN_PATH", "") +) +if not os.environ.get("_ONNX_LOOP_REEXEC") and os.path.isdir(VENV): + os.environ["VIRTUAL_ENV"] = VENV + os.environ["PATH"] = ( + os.path.join(VENV, "bin") + os.pathsep + os.environ.get("PATH", "") + ) + libs = sorted( + set( + glob.glob( + os.path.join( + VENV, "lib", "python*", "site-packages", "nvidia", "*", "lib" + ) + ) + ) + ) + if libs: + os.environ["LD_LIBRARY_PATH"] = os.pathsep.join( + [*libs, os.environ.get("LD_LIBRARY_PATH", "")] + ) + os.environ["_ONNX_LOOP_REEXEC"] = "1" + pybin = os.path.join(VENV, "bin", "python") + exe = pybin if os.path.exists(pybin) else sys.executable + os.execv(exe, [exe, *sys.argv]) + +import gi # noqa: E402 + +gi.require_version("Gst", "1.0") +from 
gi.repository import Gst, GLib # noqa: E402 + +Gst.init(None) + + +def on_message(bus, message, loop, pipeline, do_loop): + t = message.type + + if t == Gst.MessageType.EOS: + if do_loop: + # Display mode: seek back to the start to loop the clip. + print("Looping...") + if not pipeline.seek_simple( + Gst.Format.TIME, Gst.SeekFlags.FLUSH | Gst.SeekFlags.KEY_UNIT, 0 + ): + print("Failed to seek back to start", file=sys.stderr) + loop.quit() + else: + # mp4 mode: end of file, the muxer has finalized the file. + loop.quit() + + elif t == Gst.MessageType.ERROR: + err, debug = message.parse_error() + print(f"ERROR: {err}", file=sys.stderr) + if debug: + print(f"DEBUG: {debug}", file=sys.stderr) + loop.quit() + + +def main(): + if len(sys.argv) < 2: + print(f"usage: {sys.argv[0]} INPUT.mp4 [OUTPUT.mp4]", file=sys.stderr) + print( + " no OUTPUT -> live display (looping); OUTPUT -> write annotated mp4", + file=sys.stderr, + ) + sys.exit(1) + video = os.path.abspath(sys.argv[1]) + out = os.path.abspath(sys.argv[2]) if len(sys.argv) > 2 else None + + # Shared detection + overlay chain. Feed the ORIGINAL resolution: + # pyml_objectdetector letterboxes to the model's 640 internally for + # inference and maps boxes back, so the overlay stays full-res. + chain = ( + f'filesrc location="{video}" ! ' # quoted: paths may contain spaces + "decodebin ! videoconvert ! video/x-raw,format=RGB ! " + "queue max-size-buffers=8 max-size-time=0 max-size-bytes=0 ! " + "pyml_objectdetector engine-name=onnx " + f' model-name="{MODEL}" device=cuda:0 ' + " input-format=nchw post-process=anchor_free interval=1 " + " confidence=0.1 nms-iou=0.7 ! " + "queue max-size-buffers=8 max-size-time=0 max-size-bytes=0 ! " + "pyml_tracker tracker-type=bytetrack new-track-confidence=0.25 ! " + "videoconvert ! video/x-raw,format=RGBA ! " + "queue max-size-buffers=8 max-size-time=0 max-size-bytes=0 ! 
" + "pyml_football_overlay class-names=ball,goalkeeper,player,referee " + " team-colors=true trails=false show-ids=false show-labels=false " + " draw-from-detections=true min-confidence=0 merge-iou=0.5 " + " position-smoothing=0.7 highlight-focal=false ! " + ) + if out: + pipeline_description = ( + chain + "queue max-size-buffers=8 max-size-time=0 max-size-bytes=0 ! " + "videoconvert ! openh264enc ! h264parse ! mp4mux ! " + f'filesink location="{out}"' + ) + do_loop = False + else: + # Pre-roll buffer absorbs inference jitter for smooth real-time display. + pipeline_description = ( + chain + "queue max-size-buffers=600 max-size-time=0 max-size-bytes=0 " + " min-threshold-buffers=30 ! " + "videoconvert ! autovideosink sync=true" + ) + do_loop = True + + print(pipeline_description) + print(f"writing -> {out}" if out else "live display (looping)") + + try: + pipeline = Gst.parse_launch(pipeline_description) + except GLib.Error as e: + print(f"Failed to create pipeline: {e}", file=sys.stderr) + sys.exit(1) + + loop = GLib.MainLoop() + + bus = pipeline.get_bus() + bus.add_signal_watch() + bus.connect("message", on_message, loop, pipeline, do_loop) + + pipeline.set_state(Gst.State.PLAYING) + + try: + loop.run() + except KeyboardInterrupt: + if out: + # Finalize the mp4 on Ctrl-C: send EOS and wait for the muxer to + # flush its trailer, otherwise the file is left unplayable. + pipeline.send_event(Gst.Event.new_eos()) + bus.timed_pop_filtered( + 5 * Gst.SECOND, Gst.MessageType.EOS | Gst.MessageType.ERROR + ) + finally: + pipeline.set_state(Gst.State.NULL) + if out: + print(f"Done: {out}") + + +if __name__ == "__main__": + main() diff --git a/demo/football/run.sh b/demo/football/run.sh new file mode 100755 index 0000000..0be9a15 --- /dev/null +++ b/demo/football/run.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +# Football broadcast-overlay demo. 
+# +# Pipeline: +# detector -> pyml_tracker (ByteTrack) -> pyml_football_overlay +# +# Usage: +# demo/football/run.sh [INPUT.mp4] [OUTPUT.mp4] [WxH] # file -> annotated mp4 +# demo/football/run.sh display [INPUT.mp4] [WxH] # file -> live on-screen +# demo/football/run.sh camera [/dev/videoN] [WxH] # live camera -> on-screen +set -euo pipefail + +REPO="$(cd "$(dirname "$0")/../.." && pwd)" +cd "$REPO" +source .venv/bin/activate +export GST_PLUGIN_PATH="$REPO/plugins:${GST_PLUGIN_PATH:-}" + +BACKEND="${BACKEND:-pt}" +INTERVAL="${INTERVAL:-3}" # run detection every Nth frame; tracker/overlay stay per-frame +CONF="${CONF:-0.1}" # detector confidence threshold (low = more detections) +IOU="${IOU:-0.7}" # NMS IoU (ultralytics/football_analyzer default) +NEWTRACK="${NEWTRACK:-0.25}" # min confidence to START a new track (ByteTrack gate; kills ghosts) +DRAWCONF="${DRAWCONF:-0}" # min confidence to DRAW a detection (0 = draw all; raise to trim weak boxes) +MERGE="${MERGE:-0.5}" # collapse overlapping boxes (lower=merge more; 0 disables) so one player=one circle +SMOOTH="${SMOOTH:-0.6}" # temporal EMA on circle positions (0=off, higher=smoother but more lag) +CLASSES="ball,goalkeeper,player,referee" +TRACK="pyml_tracker tracker-type=bytetrack new-track-confidence=$NEWTRACK" +# Detection-based overlay: circles sit on the raw per-frame detections (no +# tracking drift/phantoms/doubles); merge collapses overlaps and +# position-smoothing low-passes the positions. DRAWCONF defaults 0 so no +# detection is hidden; the tracker still runs so the HUD keeps its stats. 
+OVERLAY="pyml_football_overlay class-names=$CLASSES team-colors=true trails=false show-ids=false show-labels=false draw-from-detections=true min-confidence=$DRAWCONF merge-iou=$MERGE position-smoothing=$SMOOTH highlight-focal=false" + +if [[ "$BACKEND" == "fp16" ]]; then + export LD_LIBRARY_PATH="$(python -c "import os,nvidia,glob;b=os.path.dirname(nvidia.__file__);print(':'.join(sorted(set(glob.glob(b+'/*/lib')))))"):${LD_LIBRARY_PATH:-}" + DETECT="pyml_objectdetector engine-name=onnx model-name=models/football/football_fp16.onnx device=cuda:0 input-format=nchw post-process=anchor_free interval=$INTERVAL confidence=$CONF nms-iou=$IOU" + IN_FMT="RGB"; FORCE_SQUARE=1 +else + DETECT="pyml_yolo model-name=models/football/football device=cuda:0 interval=$INTERVAL confidence=$CONF nms-iou=$IOU" + IN_FMT="RGBA"; FORCE_SQUARE=0 +fi + +POST_DETECT="$TRACK" +[[ "$IN_FMT" == "RGB" ]] && POST_DETECT="$TRACK ! videoconvert ! video/x-raw,format=RGBA" + +# A queue at each stage boundary turns the serial chain into a threaded +# pipeline: while inference runs on frame N, the sink renders N-1 and the +# decoder reads N+1. Nothing is dropped (leaky=no, the default). +Q="queue max-size-buffers=8 max-size-time=0 max-size-bytes=0" +# Pre-roll buffer before the display sink: build a head start of processed +# frames so real-time playback (sync=true) rides out per-frame inference +# jitter without stuttering. Smooths jitter, not a sustained throughput +# deficit -- if inference can't keep up on average, playback just lags +# (still no drops). Raise INTERVAL or the head start if it falls behind. +PREROLL="queue max-size-buffers=600 max-size-time=0 max-size-bytes=0 min-threshold-buffers=30" + +# detector -> tracker -> overlay, with a thread boundary at each hop. +CHAIN="$Q ! $DETECT ! $Q ! $POST_DETECT ! $Q ! 
$OVERLAY" + +MODE="${1:-file}" +if [[ "$MODE" == "camera" ]]; then + DEV="${2:-/dev/video0}"; SIZE="${3:-1280x720}" + [[ "$FORCE_SQUARE" == "1" ]] && SIZE="640x640" + W="${SIZE%x*}"; H="${SIZE#*x}" + echo "[$BACKEND] live camera $DEV @ ${W}x${H} -> autovideosink (needs a display)" + exec gst-launch-1.0 -e \ + v4l2src device="$DEV" ! videoconvert ! videoscale \ + ! "video/x-raw,width=${W},height=${H},format=${IN_FMT}" \ + ! $CHAIN \ + ! $Q ! videoconvert ! autovideosink sync=false +elif [[ "$MODE" == "display" ]]; then + IN="${2:-data/soccer_tracking.mp4}" + SIZE="${3:-1280x720}" + [[ "$FORCE_SQUARE" == "1" ]] && SIZE="640x640" + W="${SIZE%x*}"; H="${SIZE#*x}" + [[ -f "$IN" ]] || { echo "input not found: $IN" >&2; exit 1; } + echo "[$BACKEND] '$IN' @ ${W}x${H} -> live display (real-time, sync=true)" + exec gst-launch-1.0 -e \ + filesrc location="$IN" ! decodebin ! videoconvert ! videoscale \ + ! "video/x-raw,width=${W},height=${H},format=${IN_FMT}" \ + ! $CHAIN \ + ! $PREROLL ! videoconvert ! autovideosink sync=true +else + IN="${1:-data/soccer_tracking.mp4}" + OUT="${2:-demo/football/out.mp4}" + SIZE="${3:-1280x720}" + [[ "$FORCE_SQUARE" == "1" ]] && SIZE="640x640" + W="${SIZE%x*}"; H="${SIZE#*x}" + [[ -f "$IN" ]] || { echo "input not found: $IN" >&2; exit 1; } + echo "[$BACKEND] '$IN' @ ${W}x${H} -> '$OUT'" + gst-launch-1.0 -e \ + filesrc location="$IN" ! decodebin ! videoconvert ! videoscale \ + ! "video/x-raw,width=${W},height=${H},format=${IN_FMT}" \ + ! $CHAIN \ + ! $Q ! videoconvert ! openh264enc ! h264parse ! mp4mux ! 
filesink location="$OUT" + echo "Done: $OUT" +fi diff --git a/models/football/football.onnx b/models/football/football.onnx new file mode 100644 index 0000000..0d742a2 --- /dev/null +++ b/models/football/football.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80c093f8f67e866232f3e31e71809071cf4f6c97914ab7c0cd82cbb8d6e30dfb +size 101508645 diff --git a/models/football/football.pt b/models/football/football.pt new file mode 100644 index 0000000..e1fa8fb --- /dev/null +++ b/models/football/football.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffd531b4739e544b075479d6a41118931e82f8362d576258218e8fab2e4bdfa9 +size 51178706 diff --git a/models/football/football_fp16.onnx b/models/football/football_fp16.onnx new file mode 100644 index 0000000..6f15606 --- /dev/null +++ b/models/football/football_fp16.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e6589a1567088115f8e84564e938e09938f152b4b42902e525359bef601e350 +size 50859790 diff --git a/models/football/football_int8.onnx b/models/football/football_int8.onnx new file mode 100644 index 0000000..4dc0e6f --- /dev/null +++ b/models/football/football_int8.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af48014db347b8821efd7d107819cec2959aeb99cbb99c272e6ea9e4bd938519 +size 30817706 diff --git a/plugins/python/base_objectdetector.py b/plugins/python/base_objectdetector.py index 84f2f2d..5b14f02 100644 --- a/plugins/python/base_objectdetector.py +++ b/plugins/python/base_objectdetector.py @@ -46,6 +46,11 @@ def __init__(self): self.metadata = Metadata("si") self.logger.info("Initialized BaseObjectDetector") self.__track = False + self.__interval = 1 + self._det_counter = 0 + self._cached_results = None + self._cached_num_sources = 1 + self._cached_id = None @GObject.Property(type=bool, default=False) def track(self): @@ -60,6 +65,17 @@ def track(self, value): if self.engine: self.engine.track = value + 
@GObject.Property(type=int, default=1, minimum=1, maximum=10000) + def interval(self): + "Run detection every Nth frame and re-attach the previous detections on " + "the frames in between (N=1 runs detection every frame). Lets downstream " + "tracking/overlay stay per-frame while detection runs at a lower rate." + return self.__interval + + @interval.setter + def interval(self, value): + self.__interval = max(1, int(value)) + def do_forward(self, frames): self.logger.info( f"Forward called with frames shape: {frames.shape if frames is not None else 'None'}" @@ -77,47 +93,39 @@ def do_transform_ip(self, buf): """ self.logger.info(f"Transforming buffer: {hex(id(buf))}") try: - # Use MuxedBufferProcessor to extract frames and metadata - muxed_processor = MuxedBufferProcessor( - self.logger, - self.width, - self.height, - self.framerate_num, - self.framerate_denom, - ) - frames, id_str, num_sources, format = muxed_processor.extract_frames( - buf, self.sinkpad - ) - if frames is None: - self.logger.error("Failed to extract frames") - return Gst.FlowReturn.ERROR - - # Process frames (single or batch) - results = self.do_forward(frames) - if results is None: - self.logger.error("Inference returned None") - return Gst.FlowReturn.ERROR - - # Handle single-frame case - if num_sources == 1: - self.do_decode(buf, results, stream_idx=0) - # Handle batch case - else: - self.logger.info( - f"Processing batch with ID={id_str}, num_sources={num_sources}" + run_detect = (self._det_counter % self.__interval) == 0 + self._det_counter += 1 + + if run_detect: + # Use MuxedBufferProcessor to extract frames and metadata + muxed_processor = MuxedBufferProcessor( + self.logger, + self.width, + self.height, + self.framerate_num, + self.framerate_denom, + ) + frames, id_str, num_sources, format = muxed_processor.extract_frames( + buf, self.sinkpad ) - results_list = results if isinstance(results, list) else [results] - if len(results_list) != num_sources: - self.logger.error( - f"Expected 
{num_sources} results, got {len(results_list)}" - ) + if frames is None: + self.logger.error("Failed to extract frames") return Gst.FlowReturn.ERROR - for idx, result in enumerate(results_list): - if result is None: - self.logger.warning(f"Frame {idx} result is None") - continue - self.do_decode(buf, result, stream_idx=idx) + results = self.do_forward(frames) + if results is None: + self.logger.error("Inference returned None") + return Gst.FlowReturn.ERROR + + self._cached_results = results + self._cached_num_sources = num_sources + self._decode_results(buf, results, num_sources) + elif self._cached_results is not None: + # Skip inference on this frame and re-attach the previous + # detections so downstream tracking/overlay stay per-frame. + self._decode_results( + buf, self._cached_results, self._cached_num_sources + ) attached_meta = GstAnalytics.buffer_get_analytics_relation_meta(buf) if attached_meta: @@ -132,6 +140,22 @@ def do_transform_ip(self, buf): self.logger.error(f"Transform error: {e}\n{traceback.format_exc()}") return Gst.FlowReturn.ERROR + def _decode_results(self, buf, results, num_sources): + if num_sources == 1: + self.do_decode(buf, results, stream_idx=0) + else: + results_list = results if isinstance(results, list) else [results] + if len(results_list) != num_sources: + self.logger.error( + f"Expected {num_sources} results, got {len(results_list)}" + ) + raise ValueError("result count mismatch") # caught by do_transform_ip -> FlowReturn.ERROR, as before the refactor + for idx, result in enumerate(results_list): + if result is None: + self.logger.warning(f"Frame {idx} result is None") + continue + self.do_decode(buf, result, stream_idx=idx) + def do_decode(self, buf, output, stream_idx=0): self.logger.info( f"Decoding for stream {stream_idx}: {output} (type: {type(output)})" diff --git a/plugins/python/engine/drpai_engine.py b/plugins/python/engine/drpai_engine.py new file mode 100644 index 0000000..20c0ca9 --- /dev/null +++ b/plugins/python/engine/drpai_engine.py @@ -0,0 +1,145 @@ +# DRPAIEngine +# Copyright (C) 2024-2026 Collabora Ltd. 
+# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Library General Public +# License as published by the Free Software Foundation; either +# version 2 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Library General Public License for more details. +# +# You should have received a copy of the GNU Library General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +# Boston, MA 02110-1301, USA. + +import os +import numpy as np + +from .ml_engine import MLEngine + + +def _anchor_count(imgsz): + """Total anchors a YOLO model emits for a square input at strides 8/16/32.""" + return sum((imgsz // s) ** 2 for s in (8, 16, 32)) + + +class DRPAIEngine(MLEngine): + """DRP-AI TVM runtime engine for Renesas RZ/V boards (RZ/V2H). + + Runs a model compiled with the Renesas DRP-AI TVM compiler on the DRP-AI + NPU. `model_name` is the path to the compiled deploy directory containing + ``deploy.so`` / ``deploy.json`` / ``deploy.params``. + + Inference goes through the ``drpai_runtime`` pybind11 module (built from + ``rzv2h/`` against the board's DRP-AI TVM runtime). + """ + + def __init__(self): + super().__init__() + self.runtime = None + self.model_name = None + self.kwargs = None + self.imgsz = 640 + + self.input_format = "nchw" + self.post_process = "anchor_free" + + def do_load_model(self, model_name, **kwargs): + self.model_name = model_name + self.kwargs = kwargs + imgsz = kwargs.get("imgsz") + if imgsz: + try: + self.imgsz = int(imgsz) + except (TypeError, ValueError): + pass + + try: + import drpai_runtime + except ImportError as e: + self.logger.error( + "drpai_runtime module not found. 
Build the pybind11 binding in " + "rzv2h/ inside the RZ/V2H DRP-AI TVM SDK and put it on PYTHONPATH " + f"(see rzv2h/README.md). Import error: {e}" + ) + return False + + if not os.path.isdir(model_name): + self.logger.error( + f"DRP-AI model directory not found: {model_name!r} " + "(expected a folder with deploy.so/json/params)" + ) + return False + + try: + self.runtime = drpai_runtime.Runtime() + if not self.runtime.load(model_name): + self.logger.error(f"DRP-AI failed to load model from {model_name}") + self.runtime = None + return False + self.logger.info( + f"DRP-AI model loaded from {model_name} (imgsz={self.imgsz})" + ) + return True + except Exception as e: + self.logger.error(f"DRP-AI load error: {e}") + self.runtime = None + return False + + def do_set_device(self, device): + self.device = device + self.logger.info(f"DRP-AI engine device set to {device}") + + def do_generate(self, input_text, max_length=1000, system_prompt=None): + raise NotImplementedError( + "DRP-AI engine is a vision-inference engine; text generation is not " + "supported." 
+ ) + + def _preprocess(self, frame_hwc): + """HWC uint8 RGB(A) frame -> contiguous (1, 3, H, W) float32 in [0, 1].""" + x = np.asarray(frame_hwc, dtype=np.float32) + if x.shape[-1] > 3: + x = x[..., :3] + x = x / 255.0 + x = np.transpose(x, (2, 0, 1)) + x = np.expand_dims(x, 0) + return np.ascontiguousarray(x, dtype=np.float32) + + def _gather_output(self): + """Read output 0 and reshape the flat buffer to (1, 4+nc, anchors).""" + out = np.asarray(self.runtime.get_output(0), dtype=np.float32).reshape(-1) + anchors = _anchor_count(self.imgsz) + if anchors and out.size % anchors == 0: + channels = out.size // anchors + return out.reshape(1, channels, anchors) + self.logger.warning( + f"DRP-AI output size {out.size} not divisible by {anchors} anchors; " + "passing raw to post-process" + ) + return out + + def do_forward(self, frames): + if self.runtime is None: + self.logger.error("DRP-AI runtime not loaded") + return None + + is_batch = isinstance(frames, np.ndarray) and frames.ndim == 4 + batch = frames if is_batch else frames[np.newaxis, ...] 
+ + results = [] + for img in batch: + try: + self.runtime.set_input(0, self._preprocess(img)) + self.runtime.run() + raw = self._gather_output() + results.append(self._apply_post_process(raw, is_batch=False)) + except Exception as e: + self.logger.error(f"DRP-AI inference error: {e}") + results.append(None) + + return results if is_batch else results[0] diff --git a/plugins/python/engine/engine_factory.py b/plugins/python/engine/engine_factory.py index 2a0e5fb..d361bdf 100644 --- a/plugins/python/engine/engine_factory.py +++ b/plugins/python/engine/engine_factory.py @@ -44,6 +44,7 @@ class EngineFactory: MIGRAPHX_ENGINE = "migraphx" IREE_ENGINE = "iree" NCNN_ENGINE = "ncnn" + DRPAI_ENGINE = "drpai" _builtins_registered: bool = False # Class-level flag for singleton-like lazy init @@ -154,6 +155,13 @@ def _register_builtins(cls) -> None: except ImportError: pass + try: + from .drpai_engine import DRPAIEngine + + _try_register(cls.DRPAI_ENGINE, DRPAIEngine) + except ImportError: + pass + @staticmethod def register(engine_type: str, engine_class: Type) -> None: _engine_registry[engine_type] = engine_class diff --git a/plugins/python/engine/ml_engine.py b/plugins/python/engine/ml_engine.py index 3d5a74c..58f2e9d 100644 --- a/plugins/python/engine/ml_engine.py +++ b/plugins/python/engine/ml_engine.py @@ -96,7 +96,12 @@ def _apply_post_process(self, raw, is_batch): if pp != "none" and not isinstance(raw, list): from utils.detection_decoder import decode - results = decode(raw, pp) + results = decode( + raw, + pp, + conf_threshold=getattr(self, "conf", 0.25), + iou_threshold=getattr(self, "iou", 0.45), + ) return results[0] if not is_batch else results return raw diff --git a/plugins/python/engine/onnx_engine.py b/plugins/python/engine/onnx_engine.py index ba85e84..ad3dce7 100644 --- a/plugins/python/engine/onnx_engine.py +++ b/plugins/python/engine/onnx_engine.py @@ -49,6 +49,62 @@ def _input_is_nchw(self): shape = self.session.get_inputs()[0].shape return len(shape) == 
4 and shape[1] in (1, 3, 4) + + def _model_input_hw(self): + """(H, W) of the model input (NCHW or NHWC), or None if dynamic/unknown.""" + if self.session is None: + return None + shape = self.session.get_inputs()[0].shape + if len(shape) != 4: + return None + h, w = (shape[2], shape[3]) if self._input_is_nchw() else (shape[1], shape[2]) + if isinstance(h, int) and isinstance(w, int) and h > 0 and w > 0: + return (h, w) + return None + + def _letterbox(self, frames, is_batch): + """Resize frame(s) to the model input size, preserving aspect ratio with + grey padding (YOLO-style). Returns (processed, transform); transform = + (ratio, pad_x, pad_y, orig_w, orig_h) maps model coords back to the + original frame. Returns (frames, None) when no resize is needed (already + model-sized, or dynamic input) -- so pre-sized callers are unaffected.""" + import numpy as np + import cv2 + + mhw = self._model_input_hw() + if mhw is None: + return frames, None + mh, mw = mhw + imgs = frames if is_batch else frames[None] + h, w = int(imgs.shape[1]), int(imgs.shape[2]) + if (h, w) == (mh, mw): + return frames, None + r = min(mh / h, mw / w) + nh, nw = int(round(h * r)), int(round(w * r)) + pad_x, pad_y = (mw - nw) // 2, (mh - nh) // 2 + out = np.full((imgs.shape[0], mh, mw, imgs.shape[3]), 114, dtype=imgs.dtype) + for i in range(imgs.shape[0]): + out[i, pad_y : pad_y + nh, pad_x : pad_x + nw] = cv2.resize( + imgs[i], (nw, nh), interpolation=cv2.INTER_LINEAR + ) + proc = out if is_batch else out[0] + return proc, (r, float(pad_x), float(pad_y), w, h) + + def _unletterbox(self, results, transform): + """Map detection boxes from model coords back to original-frame coords.""" + import numpy as np + + r, pad_x, pad_y, ow, oh = transform + for res in results if isinstance(results, list) else [results]: + if not isinstance(res, dict): + continue + b = res.get("boxes") + if b is None or len(b) == 0: + continue + b = np.asarray(b, dtype=np.float32).copy() + b[:, [0, 2]] = ((b[:, [0, 2]] - pad_x) / r).clip(0, ow) + b[:, [1, 3]] = ((b[:, [1, 3]] - pad_y) / 
r).clip(0, oh) + res["boxes"] = b + def do_load_model(self, model_name, **kwargs): """Load a pre-trained model by name from TorchVision, Transformers (via Optimum ONNX), or a local ONNX path.""" processor_name = kwargs.get("processor_name") @@ -369,10 +425,21 @@ def do_forward(self, frames): fmt = self.input_format if fmt == "auto" and self._input_is_nchw(): self.input_format = "nchw" - img = self._apply_input_format(frames.astype(np.float32) / 255.0, is_batch) + # Letterbox to the model's fixed input size for inference, keeping + # the transform so boxes map back to the original frame -- lets the + # caller feed full-res frames and overlay on them. + proc, transform = self._letterbox(frames, is_batch) + img = self._apply_input_format(proc.astype(np.float32) / 255.0, is_batch) + if "float16" in self.session.get_inputs()[0].type: + img = img.astype(np.float16) outputs = self.session.run(self.output_names, {self.input_names[0]: img}) raw = outputs if len(outputs) > 1 else outputs[0] - return self._apply_post_process(raw, is_batch) + if isinstance(raw, np.ndarray) and raw.dtype != np.float32: + raw = raw.astype(np.float32) + results = self._apply_post_process(raw, is_batch) + if transform is not None: + self._unletterbox(results, transform) + return results else: raise ValueError("Unsupported model type.") diff --git a/plugins/python/football_analyzer.py b/plugins/python/football_analyzer.py new file mode 100644 index 0000000..545be61 --- /dev/null +++ b/plugins/python/football_analyzer.py @@ -0,0 +1,906 @@ +# FootballAnalyzer +# Copyright (C) 2024-2026 Collabora Ltd. +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Library General Public +# License as published by the Free Software Foundation; either +# version 2 of the License, or (at your option) any later version. 
+# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Library General Public License for more details. +# +# You should have received a copy of the GNU Library General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +# Boston, MA 02110-1301, USA. + +import os +import pickle + +from log.global_logger import GlobalLogger + +CAN_REGISTER_ELEMENT = True +try: + import gi + + gi.require_version("Gst", "1.0") + gi.require_version("GstBase", "1.0") + gi.require_version("GstVideo", "1.0") + from gi.repository import Gst, GstBase, GstVideo, GObject # noqa: E402 + + # Define caps before the optional heavy imports so the element's pad + # templates still resolve when an optional dep (e.g. supervision) is missing; + # only registration is then skipped (CAN_REGISTER_ELEMENT=False). + VIDEO_CAPS = Gst.Caps.from_string("video/x-raw, format=BGR") + + import cv2 + import numpy as np + import supervision as sv + from ultralytics import YOLO + + from log.logger_factory import LoggerFactory # noqa: E402 + +except ImportError as e: + CAN_REGISTER_ELEMENT = False + GlobalLogger().warning( + f"The 'pyml_football_analyzer' element will not be available. 
Error: {e}" + ) + + +def get_center_of_bbox(bbox): + x1, y1, x2, y2 = bbox + return int((x1 + x2) / 2), int((y1 + y2) / 2) + + +def get_bbox_width(bbox): + return bbox[2] - bbox[0] + + +class Tracker: + """Tracker.""" + + def __init__(self, model_path): + self.model = YOLO(model_path) + self.tracker = sv.ByteTrack() + self.sift = cv2.SIFT_create() + self.matcher = cv2.BFMatcher(cv2.NORM_L2) + + def _foreground_mask(self, shape, frame_tracks, dilation=15): + h, w = shape[:2] + mask = np.full((h, w), 255, dtype=np.uint8) + bboxes = [] + for key in ("players", "referees", "ball"): + for obj in frame_tracks.get(key, {}).values(): + bboxes.append(obj["bbox"]) + for bbox in bboxes: + x1 = max(0, int(bbox[0]) - dilation) + y1 = max(0, int(bbox[1]) - dilation) + x2 = min(w, int(bbox[2]) + dilation) + y2 = min(h, int(bbox[3]) + dilation) + mask[y1:y2, x1:x2] = 0 + return mask + + def get_camera_motion( + self, + frames, + tracks, + read_from_stub=False, + stub_path=None, + ratio=0.75, + ransac_thresh=3.0, + min_matches=8, + ): + if read_from_stub and stub_path is not None and os.path.exists(stub_path): + with open(stub_path, "rb") as f: + return pickle.load(f) + + cumulative = [np.eye(3, dtype=np.float64)] + prev_gray = cv2.cvtColor(frames[0], cv2.COLOR_BGR2GRAY) + prev_mask = self._foreground_mask( + frames[0].shape, {k: tracks[k][0] for k in tracks} + ) + prev_kp, prev_desc = self.sift.detectAndCompute(prev_gray, prev_mask) + + for i in range(1, len(frames)): + curr_gray = cv2.cvtColor(frames[i], cv2.COLOR_BGR2GRAY) + curr_mask = self._foreground_mask( + frames[i].shape, {k: tracks[k][i] for k in tracks} + ) + curr_kp, curr_desc = self.sift.detectAndCompute(curr_gray, curr_mask) + + H_step = np.eye(3, dtype=np.float64) + if ( + prev_desc is not None + and curr_desc is not None + and len(prev_desc) >= 2 + and len(curr_desc) >= 2 + ): + knn = self.matcher.knnMatch(prev_desc, curr_desc, k=2) + good = [ + m + for pair in knn + if len(pair) == 2 + for m, n in [pair] + if 
m.distance < ratio * n.distance + ] + if len(good) >= min_matches: + pts_prev = np.float32( + [prev_kp[m.queryIdx].pt for m in good] + ).reshape(-1, 1, 2) + pts_curr = np.float32( + [curr_kp[m.trainIdx].pt for m in good] + ).reshape(-1, 1, 2) + H, _ = cv2.findHomography( + pts_prev, pts_curr, cv2.RANSAC, ransac_thresh + ) + if H is not None: + H_step = H + + cumulative.append(H_step @ cumulative[-1]) + prev_kp, prev_desc = curr_kp, curr_desc + + if stub_path is not None: + with open(stub_path, "wb") as f: + pickle.dump(cumulative, f) + return cumulative + + def detect_frames(self, frames): + batch_size = 20 + detections = [] + for i in range(0, len(frames), batch_size): + detections_batch = self.model.predict(frames[i : i + batch_size], conf=0.1) + detections += detections_batch + return detections + + def get_object_tracks(self, frames, read_from_stub=False, stub_path=None): + if read_from_stub and stub_path is not None and os.path.exists(stub_path): + with open(stub_path, "rb") as f: + tracks = pickle.load(f) + return tracks + + detections = self.detect_frames(frames) + tracks = {"players": [], "referees": [], "ball": []} + + per_frame = [] + class_votes = {} + for detection in detections: + cls_names = detection.names + cls_names_inv = {v: k for k, v in cls_names.items()} + + detection_supervision = sv.Detections.from_ultralytics(detection) + + for object_ind, class_id in enumerate(detection_supervision.class_id): + if cls_names[class_id] == "goalkeeper": + detection_supervision.class_id[object_ind] = cls_names_inv["player"] + + tracked = self.tracker.update_with_detections(detection_supervision) + per_frame.append((tracked, detection_supervision, cls_names, cls_names_inv)) + + for fd in tracked: + cls_name = cls_names[fd[3]] + track_id = fd[4] + if cls_name in ("player", "referee"): + v = class_votes.setdefault(track_id, {"player": 0, "referee": 0}) + v[cls_name] += 1 + + track_class = { + tid: ("player" if v["player"] >= v["referee"] else "referee") + for tid, 
def draw_ellipse(self, frame, bbox, color, track_id=None):
    """Draw a foot-level ellipse for one subject, plus an optional id badge.

    The ellipse is centred on the box's horizontal centre at the box's
    bottom edge (``bbox[3]``); its axes are derived from the box width.
    When ``track_id`` is given, a filled 40x20 badge in ``color`` is drawn
    just below the ellipse centre with the id printed on it in black.

    Args:
        frame: Image handed to the OpenCV drawing calls.
        bbox: Box as ``(x1, y1, x2, y2)``.
        color: Colour used for the ellipse outline and the badge fill.
        track_id: Optional id to print; ``None`` skips the badge.

    Returns:
        The ``frame`` object that was passed in.
    """
    foot_y = int(bbox[3])
    center_x, _ = get_center_of_bbox(bbox)
    box_width = get_bbox_width(bbox)

    # Open arc (-45 deg .. 235 deg) rather than a closed ellipse.
    cv2.ellipse(
        frame,
        center=(center_x, foot_y),
        axes=(int(box_width), int(0.35 * box_width)),
        angle=0.0,
        startAngle=-45,
        endAngle=235,
        color=color,
        thickness=2,
        lineType=cv2.LINE_4,
    )

    if track_id is None:
        return frame

    badge_w, badge_h = 40, 20
    badge_left = center_x - badge_w // 2
    badge_right = center_x + badge_w // 2
    badge_top = (foot_y - badge_h // 2) + 15
    badge_bottom = (foot_y + badge_h // 2) + 15
    cv2.rectangle(
        frame,
        (int(badge_left), int(badge_top)),
        (int(badge_right), int(badge_bottom)),
        color,
        cv2.FILLED,
    )

    # Three-digit ids start further left so they still fit on the badge.
    text_x = badge_left + 12
    if track_id > 99:
        text_x -= 10
    cv2.putText(
        frame,
        f"{track_id}",
        (int(text_x), int(badge_top + 15)),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.6,
        (0, 0, 0),
        2,
    )
    return frame
def classify_jersey(self, frame, bbox):
    """Vote ``"red"`` or ``"blue"`` for a kit from a torso patch, else ``None``.

    The patch spans 15%-55% of the box height and 25%-75% of its width,
    clamped to the frame. Pixels count only when saturation > 80 and
    value > 50; hue <= 10 or >= 170 counts as red, 100..130 as blue.

    Args:
        frame: Image array; it is converted with ``cv2.COLOR_BGR2HSV``.
        bbox: Box as ``(x1, y1, x2, y2)``; coordinates are truncated to int.

    Returns:
        ``"red"`` or ``"blue"`` (ties go to red), or ``None`` when the box
        is degenerate, the patch is under 3 px on a side, or neither colour
        reaches ``max(20, 2% of the patch area)`` pixels.
    """
    left, top, right, bottom = (int(coord) for coord in bbox)
    box_h = bottom - top
    box_w = right - left
    if box_h <= 0 or box_w <= 0:
        return None

    frame_h, frame_w = frame.shape[:2]
    row_lo = max(0, top + int(0.15 * box_h))
    row_hi = min(frame_h, top + int(0.55 * box_h))
    col_lo = max(0, left + int(0.25 * box_w))
    col_hi = min(frame_w, left + int(0.75 * box_w))
    if row_hi - row_lo < 3 or col_hi - col_lo < 3:
        return None

    torso = frame[row_lo:row_hi, col_lo:col_hi]
    hsv = cv2.cvtColor(torso, cv2.COLOR_BGR2HSV)
    hue, sat, val = hsv[..., 0], hsv[..., 1], hsv[..., 2]
    # Ignore washed-out / dark pixels (pitch shadows, white trim, skin).
    vivid = (sat > 80) & (val > 50)
    red_px = (((hue <= 10) | (hue >= 170)) & vivid).sum()
    blue_px = ((hue >= 100) & (hue <= 130) & vivid).sum()

    needed = max(20, int(0.02 * torso.shape[0] * torso.shape[1]))
    if max(red_px, blue_px) < needed:
        return None
    if red_px >= blue_px:
        return "red"
    return "blue"
mm_h // 2), max(10, mm_h // 8), (240, 240, 240), 1) + return bg + + def _project_to_minimap(self, extent, mm_w, mm_h, x, y): + min_x, min_y, max_x, max_y = extent + dx = max(1e-6, max_x - min_x) + dy = max(1e-6, max_y - min_y) + scale = min((mm_w - 10) / dx, (mm_h - 10) / dy) + off_x = (mm_w - scale * dx) / 2.0 + off_y = (mm_h - scale * dy) / 2.0 + return int(off_x + (x - min_x) * scale), int(off_y + (y - min_y) * scale) + + def _smooth_points(self, pts, window): + if window <= 1 or len(pts) < 2: + return pts + pts = np.asarray(pts, dtype=np.float32) + n = len(pts) + half = window // 2 + smoothed = np.empty_like(pts) + for i in range(n): + lo = max(0, i - half) + hi = min(n, i + half + 1) + smoothed[i] = pts[lo:hi].mean(axis=0) + return smoothed + + def draw_trail(self, frame, points, color): + if len(points) < 2: + return frame + pts = np.array(points, dtype=np.int32).reshape(-1, 1, 2) + cv2.polylines( + frame, [pts], isClosed=False, color=color, thickness=2, lineType=cv2.LINE_AA + ) + return frame + + def _point_to_bbox_distance(self, px, py, bbox): + x1, y1, x2, y2 = bbox + dx = max(x1 - px, 0.0, px - x2) + dy = max(y1 - py, 0.0, py - y2) + return float(np.hypot(dx, dy)) + + def _ball_contact(self, player_dict, ball_bbox, contact_pad_ratio): + bx, by = get_center_of_bbox(ball_bbox) + best_tid, best_d, best_bbox = None, float("inf"), None + for tid, player in player_dict.items(): + d = self._point_to_bbox_distance(bx, by, player["bbox"]) + if d < best_d: + best_tid, best_d, best_bbox = tid, d, player["bbox"] + if best_bbox is None: + return None + w_box = best_bbox[2] - best_bbox[0] + h_box = best_bbox[3] - best_bbox[1] + if best_d > contact_pad_ratio * max(w_box, h_box): + return None + return best_tid + + def _count_total_contacts(self, tracks, contact_gap_frames, contact_pad_ratio): + totals = {} + last_contact_frame = {} + for frame_num, (player_dict, ball_dict) in enumerate( + zip(tracks["players"], tracks["ball"]) + ): + ball = ball_dict.get(1) + if ball is 
def draw_annotations(
    self,
    video_frames,
    tracks,
    camera_motion=None,
    trail_length=30,
    contact_gap_frames=5,
    contact_pad_ratio=0.25,
    player_height_m=1.8,
    headshot_path=None,
    headshot_size=90,
    logo_path=None,
    logo_height=80,
    logo_margin=15,
    trail_smooth_window=11,
    show_minimap=True,
    minimap_size=(320, 200),
    minimap_margin=15,
):
    """Render the overlay onto a copy of every frame and return the copies.

    Per frame this draws motion trails, player ellipses coloured by the
    running jersey vote, referee ellipses, the ball triangle, a HUD for the
    focal player, an optional top-right logo and an optional bottom-right
    minimap. The input frames are not modified (each is copied first).

    Args:
        video_frames: Sequence of frame arrays.
        tracks: Dict with ``"players"``, ``"referees"`` and ``"ball"``
            lists, indexed by frame number; each entry maps an id to
            ``{"bbox": [x1, y1, x2, y2]}``.
        camera_motion: Optional per-frame 3x3 homographies; identity is
            used when ``None``.
        trail_length: Maximum number of points kept per trail.
        contact_gap_frames: Minimum frame gap between two counted ball
            contacts of the same player.
        contact_pad_ratio: Forwarded to ``_ball_contact``.
        player_height_m: Assumed player height, used with the median box
            height to convert pixels to metres.
        headshot_path: Optional image shown in the HUD.
        headshot_size: Side length the headshot is resized to.
        logo_path: Optional logo image (alpha channel honoured if present).
        logo_height: Height the logo is resized to.
        logo_margin: Gap between the logo and the top/right frame edges.
        trail_smooth_window: Window passed to ``_smooth_points``.
        show_minimap: Whether to draw the minimap.
        minimap_size: Minimap ``(width, height)`` in pixels.
        minimap_margin: Gap between the minimap and the bottom/right edges.

    Returns:
        A list with one annotated frame per input frame.
    """

    def paste_clipped(frame, image, x0, y0, alpha=None):
        # Copy (or alpha-blend) `image` into `frame` with its top-left corner
        # at (x0, y0), cropping whatever falls outside the frame. Without the
        # crop an overlay larger than the frame makes the slice assignment
        # fail with a shape mismatch.
        fh, fw = frame.shape[:2]
        ih, iw = image.shape[:2]
        dst_x0, dst_y0 = max(0, x0), max(0, y0)
        dst_x1, dst_y1 = min(fw, x0 + iw), min(fh, y0 + ih)
        if dst_x1 <= dst_x0 or dst_y1 <= dst_y0:
            return
        src_x0, src_y0 = dst_x0 - x0, dst_y0 - y0
        src_x1 = src_x0 + (dst_x1 - dst_x0)
        src_y1 = src_y0 + (dst_y1 - dst_y0)
        patch = image[src_y0:src_y1, src_x0:src_x1]
        if alpha is None:
            frame[dst_y0:dst_y1, dst_x0:dst_x1] = patch
            return
        weight = alpha[src_y0:src_y1, src_x0:src_x1]
        roi = frame[dst_y0:dst_y1, dst_x0:dst_x1].astype(np.float32)
        blended = roi * (1.0 - weight) + patch.astype(np.float32) * weight
        frame[dst_y0:dst_y1, dst_x0:dst_x1] = blended.astype(np.uint8)

    output_video_frames = []
    player_trails = {}
    team_votes = {}
    team_bgr = {"red": (0, 0, 255), "blue": (255, 0, 0)}
    default_color = (200, 200, 200)
    focal_color = (131, 41, 92)

    # Whole-clip statistics, needed before the first frame is drawn.
    frames_count = {}
    for frame_players in tracks["players"]:
        for tid in frame_players:
            frames_count[tid] = frames_count.get(tid, 0) + 1
    total_contacts = self._count_total_contacts(
        tracks, contact_gap_frames, contact_pad_ratio
    )

    heights = [
        p["bbox"][3] - p["bbox"][1]
        for frame_players in tracks["players"]
        for p in frame_players.values()
        if p["bbox"][3] > p["bbox"][1]
    ]
    px_per_meter = float(np.median(heights)) / player_height_m if heights else 1.0

    headshot = None
    if headshot_path is not None and os.path.exists(headshot_path):
        img = cv2.imread(headshot_path)
        if img is not None:
            headshot = cv2.resize(
                img, (headshot_size, headshot_size), interpolation=cv2.INTER_AREA
            )

    logo_bgr, logo_alpha = None, None
    if logo_path is not None and os.path.exists(logo_path):
        img = cv2.imread(logo_path, cv2.IMREAD_UNCHANGED)
        if img is not None:
            scale = logo_height / img.shape[0]
            new_w = max(1, int(round(img.shape[1] * scale)))
            img = cv2.resize(
                img, (new_w, logo_height), interpolation=cv2.INTER_LANCZOS4
            )
            if img.ndim == 3 and img.shape[2] == 4:
                logo_bgr = img[..., :3]
                logo_alpha = (img[..., 3:4].astype(np.float32)) / 255.0
            else:
                logo_bgr = (
                    img if img.ndim == 3 else cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
                )

    minimap_bg, minimap_extent = None, None
    if show_minimap:
        minimap_extent = self._minimap_extent(tracks, camera_motion)
        if minimap_extent is not None:
            minimap_bg = self._make_minimap_bg(minimap_size[0], minimap_size[1])

    # Focal player: most ball contacts over the clip (frames seen breaks
    # ties); with no contacts at all, the most-tracked player.
    if total_contacts:
        focal_tid = max(
            total_contacts,
            key=lambda t: (total_contacts[t], frames_count.get(t, 0)),
        )
    elif frames_count:
        focal_tid = max(frames_count, key=frames_count.get)
    else:
        focal_tid = None

    def color_for(track_id):
        # Reads `team_votes` live, so the colour follows the running vote.
        if track_id == focal_tid:
            return focal_color
        counts = team_votes.get(track_id)
        if not counts or (counts["red"] == 0 and counts["blue"] == 0):
            return default_color
        return (
            team_bgr["red"]
            if counts["red"] >= counts["blue"]
            else team_bgr["blue"]
        )

    last_ref_pt = {}
    player_distance = {}
    player_contacts = {}
    last_contact_frame = {}
    for frame_num, frame in enumerate(video_frames):
        frame = frame.copy()

        player_dict = tracks["players"][frame_num]
        ball_dict = tracks["ball"][frame_num]
        referee_dict = tracks["referees"][frame_num]

        H_cum = camera_motion[frame_num] if camera_motion is not None else np.eye(3)
        H_inv = np.linalg.inv(H_cum)

        active_ids = set(player_dict.keys())
        for track_id, player in player_dict.items():
            # Foot point mapped through the inverse camera homography, so
            # trails and distances are accumulated in one reference frame.
            ref_tuple = self._ref_bottom_center(player["bbox"], H_inv)
            player_trails.setdefault(track_id, []).append(ref_tuple)
            if len(player_trails[track_id]) > trail_length:
                player_trails[track_id] = player_trails[track_id][-trail_length:]

            if track_id in last_ref_pt:
                dx = ref_tuple[0] - last_ref_pt[track_id][0]
                dy = ref_tuple[1] - last_ref_pt[track_id][1]
                player_distance[track_id] = player_distance.get(
                    track_id, 0.0
                ) + float(np.hypot(dx, dy))
            last_ref_pt[track_id] = ref_tuple

            vote = self.classify_jersey(frame, player["bbox"])
            if vote is not None:
                counts = team_votes.setdefault(track_id, {"red": 0, "blue": 0})
                counts[vote] += 1

        # Drop trails of players not present in this frame; forgetting the
        # last point keeps a re-appearing track from adding a distance jump.
        for track_id in list(player_trails.keys()):
            if track_id not in active_ids:
                del player_trails[track_id]
                last_ref_pt.pop(track_id, None)

        ball = ball_dict.get(1)
        if ball is not None and player_dict:
            tid = self._ball_contact(player_dict, ball["bbox"], contact_pad_ratio)
            if tid is not None:
                last = last_contact_frame.get(tid)
                if last is None or (frame_num - last) > contact_gap_frames:
                    player_contacts[tid] = player_contacts.get(tid, 0) + 1
                last_contact_frame[tid] = frame_num

        for track_id, ref_points in player_trails.items():
            smoothed_ref = self._smooth_points(ref_points, trail_smooth_window)
            # Back from the reference frame into this frame's pixels.
            pts = cv2.perspectiveTransform(
                np.asarray(smoothed_ref, dtype=np.float32).reshape(-1, 1, 2), H_cum
            ).reshape(-1, 2)
            frame = self.draw_trail(frame, pts.tolist(), color_for(track_id))

        for track_id, player in player_dict.items():
            frame = self.draw_ellipse(frame, player["bbox"], color_for(track_id))

        for referee in referee_dict.values():
            frame = self.draw_ellipse(frame, referee["bbox"], (0, 255, 255))

        for ball_entry in ball_dict.values():
            frame = self.draw_traingle(frame, ball_entry["bbox"], (0, 255, 0))

        if focal_tid is not None:
            frame = self.draw_player_hud(
                frame,
                focal_tid,
                player_contacts.get(focal_tid, 0),
                player_distance.get(focal_tid, 0.0) / px_per_meter,
                color_for(focal_tid),
                headshot=headshot,
            )

        if logo_bgr is not None:
            fw = frame.shape[1]
            x0 = max(0, fw - logo_bgr.shape[1] - logo_margin)
            paste_clipped(frame, logo_bgr, x0, logo_margin, alpha=logo_alpha)

        if minimap_bg is not None and minimap_extent is not None:
            mm = minimap_bg.copy()
            mm_w, mm_h = minimap_size
            for tid, player in player_dict.items():
                rx, ry = self._ref_bottom_center(player["bbox"], H_inv)
                mx, my = self._project_to_minimap(
                    minimap_extent, mm_w, mm_h, rx, ry
                )
                radius = 6 if tid == focal_tid else 4
                cv2.circle(mm, (mx, my), radius, color_for(tid), cv2.FILLED)
                cv2.circle(mm, (mx, my), radius, (0, 0, 0), 1)
            for referee in referee_dict.values():
                rx, ry = self._ref_bottom_center(referee["bbox"], H_inv)
                mx, my = self._project_to_minimap(
                    minimap_extent, mm_w, mm_h, rx, ry
                )
                cv2.circle(mm, (mx, my), 3, (0, 255, 255), cv2.FILLED)
                cv2.circle(mm, (mx, my), 3, (0, 0, 0), 1)
            if ball is not None:
                bx, by = get_center_of_bbox(ball["bbox"])
                bref = cv2.perspectiveTransform(
                    np.array([[[bx, by]]], dtype=np.float32), H_inv
                )[0][0]
                mx, my = self._project_to_minimap(
                    minimap_extent, mm_w, mm_h, float(bref[0]), float(bref[1])
                )
                cv2.circle(mm, (mx, my), 4, (0, 255, 0), cv2.FILLED)
                cv2.circle(mm, (mx, my), 4, (0, 0, 0), 1)
            fh, fw = frame.shape[:2]
            x0 = max(0, fw - mm_w - minimap_margin)
            y0 = max(0, fh - mm_h - minimap_margin)
            paste_clipped(frame, mm, x0, y0)

        output_video_frames.append(frame)

    return output_video_frames
def do_transform_ip(self, buf):
    """Copy the incoming frame and its timing into the element's buffers.

    The buffer is mapped read-only, reinterpreted as a
    ``(height, width, 3)`` uint8 array using the size recorded in
    ``do_set_caps``, copied, and unmapped again. The copy, ``buf.pts`` and
    ``buf.duration`` are appended to ``_frames``, ``_pts`` and
    ``_duration``.

    Returns:
        ``Gst.FlowReturn.OK`` on success; ``Gst.FlowReturn.ERROR`` when the
        buffer cannot be mapped or any exception is raised (it is logged).
    """
    # NOTE(review): returning OK from transform_ip presumably lets the base
    # class push this same (un-annotated) buffer downstream, in addition to
    # the annotated frames pushed at EOS -- confirm that is intended.
    try:
        mapped, info = buf.map(Gst.MapFlags.READ)
        if mapped:
            try:
                pixels = np.frombuffer(info.data, dtype=np.uint8)
                # .copy() detaches the frame from the mapped memory, which
                # is released in the finally clause below.
                snapshot = pixels.reshape(self._height, self._width, 3).copy()
            finally:
                buf.unmap(info)

            self._frames.append(snapshot)
            self._pts.append(buf.pts)
            self._duration.append(buf.duration)
            return Gst.FlowReturn.OK

        self.logger.error("Failed to map incoming buffer for read")
    except Exception as e:
        self.logger.error(f"FootballAnalyzer chain error: {e}")
    return Gst.FlowReturn.ERROR
event): + if event.type == Gst.EventType.EOS: + try: + self._run_pipeline_and_push() + except Exception as e: + self.logger.error(f"FootballAnalyzer EOS processing failed: {e}") + # Forward EOS regardless so the pipeline shuts down cleanly. + return GstBase.BaseTransform.do_sink_event(self, event) + + def _run_pipeline_and_push(self): + if not self._frames: + self.logger.info("FootballAnalyzer: no frames buffered, skipping") + return + + tracker = self._ensure_tracker() + n = len(self._frames) + self.logger.info(f"FootballAnalyzer: running pipeline on {n} frames") + + tracks_stub = self.tracks_stub_path or None + cam_stub = self.camera_motion_stub_path or None + headshot = self.headshot_path or None + logo = self.logo_path or None + + tracks = tracker.get_object_tracks( + self._frames, + read_from_stub=tracks_stub is not None and os.path.exists(tracks_stub), + stub_path=tracks_stub, + ) + camera_motion = tracker.get_camera_motion( + self._frames, + tracks, + read_from_stub=cam_stub is not None and os.path.exists(cam_stub), + stub_path=cam_stub, + ) + annotated = tracker.draw_annotations( + self._frames, + tracks, + camera_motion=camera_motion, + headshot_path=headshot, + logo_path=logo, + show_minimap=self.show_minimap, + ) + + if len(annotated) != n: + self.logger.warning( + f"draw_annotations returned {len(annotated)} frames for {n} inputs; " + "padding/truncating to match" + ) + if len(annotated) < n: + annotated = list(annotated) + [annotated[-1]] * (n - len(annotated)) + else: + annotated = annotated[:n] + + srcpad = self.srcpad + for i, out in enumerate(annotated): + data = np.ascontiguousarray(out, dtype=np.uint8).tobytes() + outbuf = Gst.Buffer.new_allocate(None, len(data), None) + outbuf.fill(0, data) + outbuf.pts = self._pts[i] + outbuf.duration = self._duration[i] + ret = srcpad.push(outbuf) + if ret != Gst.FlowReturn.OK: + self.logger.error( + f"Pushing annotated frame {i} failed with {ret}; aborting" + ) + break + + self._frames.clear() + 
self._pts.clear() + self._duration.clear() + + +if CAN_REGISTER_ELEMENT: + GObject.type_register(FootballAnalyzer) + __gstelementfactory__ = ( + "pyml_football_analyzer", + Gst.Rank.NONE, + FootballAnalyzer, + ) +else: + GlobalLogger().warning( + "The 'pyml_football_analyzer' element will not be registered because " + "required modules are missing." + ) diff --git a/plugins/python/football_overlay.py b/plugins/python/football_overlay.py new file mode 100644 index 0000000..9dc4f42 --- /dev/null +++ b/plugins/python/football_overlay.py @@ -0,0 +1,1135 @@ +# FootballOverlay +# Copyright (C) 2024-2026 Collabora Ltd. +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Library General Public +# License as published by the Free Software Foundation; either +# version 2 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Library General Public License for more details. +# +# You should have received a copy of the GNU Library General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +# Boston, MA 02110-1301, USA. 
+ +import os + +from log.global_logger import GlobalLogger + +CAN_REGISTER_ELEMENT = True +try: + import re + import gi + + gi.require_version("Gst", "1.0") + gi.require_version("GstBase", "1.0") + gi.require_version("GstVideo", "1.0") + gi.require_version("GstAnalytics", "1.0") + gi.require_version("GLib", "2.0") + from gi.repository import ( + Gst, + GstBase, + GstVideo, + GstAnalytics, + GObject, + GLib, + ) # noqa: E402 + + from log.logger_factory import LoggerFactory # noqa: E402 + + OVERLAY_CAPS = Gst.Caps.from_string( + "video/x-raw, format=(string){ RGBA, ARGB, BGRA, ABGR }" + ) + +except ImportError as e: + CAN_REGISTER_ELEMENT = False + GlobalLogger().warning( + f"The 'pyml_football_overlay' element will not be available. Error: {e}" + ) + + +_FORMAT_ORDER = { + "RGBA": (0, 1, 2, 3), + "ARGB": (3, 0, 1, 2), + "BGRA": (2, 1, 0, 3), + "ABGR": (3, 2, 1, 0), +} + +_PALETTE = [ + (239, 71, 111, 255), + (255, 209, 102, 255), + (6, 214, 160, 255), + (17, 138, 178, 255), + (255, 107, 107, 255), + (78, 205, 196, 255), + (199, 125, 255, 255), + (255, 159, 28, 255), + (46, 196, 182, 255), + (118, 200, 247, 255), +] + +_REFEREE_RGBA = (255, 215, 0, 255) +_BALL_RGBA = (0, 230, 0, 255) +_PLAYER_RGBA = (0, 200, 255, 255) +_RED_TEAM_RGBA = (255, 40, 40, 255) +_BLUE_TEAM_RGBA = (40, 90, 255, 255) +_DEFAULT_RGBA = (235, 235, 235, 255) +_BLACK_RGBA = (0, 0, 0, 255) +_HUD_BG_RGBA = (92, 41, 131, 255) +_HUD_TEXT_RGBA = (64, 186, 47, 255) +_HIGHLIGHT_RGBA = (255, 255, 255, 255) + + +def _is_ball(label): + return "ball" in label + + +def _is_referee(label): + return "referee" in label or label == "ref" + + +class FootballOverlay(GstBase.BaseTransform): + """ + Metadata-driven broadcast overlay (football_analysis style), streaming. 
+ + Reads upstream GstAnalytics detection/tracking metadata and draws: an + ellipse + optional id badge per subject, a gold ellipse for referees, a + green triangle on the ball, fading motion trails, and a focal-player HUD + with a headshot, accumulated ball contacts, and distance travelled. + """ + + __gstmetadata__ = ( + "Football Overlay", + "Filter/Effect/Video", + "Broadcast-style detection/tracking overlay (ellipses, ball triangle, " + "trails, headshot HUD with ball contacts + distance) from GstAnalytics", + "Marcus Edel ", + ) + + src_template = Gst.PadTemplate.new( + "src", Gst.PadDirection.SRC, Gst.PadPresence.ALWAYS, OVERLAY_CAPS.copy() + ) + sink_template = Gst.PadTemplate.new( + "sink", Gst.PadDirection.SINK, Gst.PadPresence.ALWAYS, OVERLAY_CAPS.copy() + ) + __gsttemplates__ = (src_template, sink_template) + + show_labels = GObject.Property( + type=bool, + default=True, + nick="Show Labels", + blurb="Draw the class name above each object", + flags=GObject.ParamFlags.READWRITE, + ) + show_ids = GObject.Property( + type=bool, + default=True, + nick="Show Track IDs", + blurb="Draw the track-id badge under each tracked object", + flags=GObject.ParamFlags.READWRITE, + ) + trails = GObject.Property( + type=bool, + default=True, + nick="Show Trails", + blurb="Draw a fading motion trail behind each tracked object", + flags=GObject.ParamFlags.READWRITE, + ) + trail_length = GObject.Property( + type=int, + default=30, + minimum=2, + maximum=300, + nick="Trail Length", + blurb="Number of recent positions kept in each motion trail", + flags=GObject.ParamFlags.READWRITE, + ) + show_ball = GObject.Property( + type=bool, + default=False, + nick="Show Ball", + blurb="Draw the marker on the ball (the ball is still tracked for " + "contact counting either way)", + flags=GObject.ParamFlags.READWRITE, + ) + show_hud = GObject.Property( + type=bool, + default=True, + nick="Show HUD", + blurb="Draw the focal-player HUD (headshot, label, contacts, distance)", + 
flags=GObject.ParamFlags.READWRITE, + ) + headshot_path = GObject.Property( + type=str, + default="data/Chinedu-Obasi_2684938.jpg", + nick="Headshot Path", + blurb="Image shown in the HUD (empty to disable)", + flags=GObject.ParamFlags.READWRITE, + ) + headshot_size = GObject.Property( + type=int, + default=90, + minimum=16, + maximum=512, + nick="Headshot Size", + blurb="Headshot square size in pixels", + flags=GObject.ParamFlags.READWRITE, + ) + player_label = GObject.Property( + type=str, + default="Player #8", + nick="Player Label", + blurb="Static label drawn in the HUD", + flags=GObject.ParamFlags.READWRITE, + ) + contact_pad_ratio = GObject.Property( + type=float, + default=0.25, + minimum=0.0, + maximum=5.0, + nick="Contact Pad Ratio", + blurb="Ball counts as a contact within this fraction of the player box size", + flags=GObject.ParamFlags.READWRITE, + ) + contact_gap_frames = GObject.Property( + type=int, + default=5, + minimum=0, + maximum=1000, + nick="Contact Gap Frames", + blurb="Min frames between counted contacts for the same player", + flags=GObject.ParamFlags.READWRITE, + ) + player_height = GObject.Property( + type=float, + default=1.8, + minimum=0.1, + maximum=10.0, + nick="Player Height (m)", + blurb="Assumed real-world height used to convert pixels to metres", + flags=GObject.ParamFlags.READWRITE, + ) + min_confidence = GObject.Property( + type=float, + default=0.0, + minimum=0.0, + maximum=1.0, + nick="Min Confidence", + blurb="Skip detections whose confidence is below this threshold", + flags=GObject.ParamFlags.READWRITE, + ) + class_names = GObject.Property( + type=str, + default="", + nick="Class Names", + blurb="Comma-separated names to map numeric labels (label_N) from the " + "onnx/objectdetector path, e.g. 
'ball,goalkeeper,player,referee'", + flags=GObject.ParamFlags.READWRITE, + ) + team_colors = GObject.Property( + type=bool, + default=True, + nick="Team Colors", + blurb="Colour players by jersey team (red/blue, per-track majority vote); " + "off draws all players one colour", + flags=GObject.ParamFlags.READWRITE, + ) + draw_from_detections = GObject.Property( + type=bool, + default=False, + nick="Draw From Detections", + blurb="Draw ellipses on the raw per-frame detection boxes instead of the " + "tracker's boxes -- no Kalman drift, coasted phantoms or track-split " + "doubles. Team colour is then classified per frame; the HUD still uses " + "tracker metadata if present", + flags=GObject.ParamFlags.READWRITE, + ) + merge_iou = GObject.Property( + type=float, + default=0.5, + minimum=0.0, + maximum=1.0, + nick="Merge IoU", + blurb="Collapse overlapping boxes (across classes) into one before " + "drawing, so one player isn't circled twice; a box is merged when its " + "IoU or containment with a kept box exceeds this (0 disables)", + flags=GObject.ParamFlags.READWRITE, + ) + position_smoothing = GObject.Property( + type=float, + default=0.5, + minimum=0.0, + maximum=0.95, + nick="Position Smoothing", + blurb="Temporal EMA on drawn box positions (0=off, higher=smoother but " + "more lag). 
Boxes are associated frame-to-frame by proximity, so this " + "damps detection jitter and the steps from a detection interval > 1", + flags=GObject.ParamFlags.READWRITE, + ) + highlight_focal = GObject.Property( + type=bool, + default=True, + nick="Highlight Focal Player", + blurb="Mark the focal player (the one shown in the HUD) on the pitch " + "with a chevron above their head and a bolder ellipse", + flags=GObject.ParamFlags.READWRITE, + ) + focal_track_id = GObject.Property( + type=int, + default=-1, + minimum=-1, + maximum=100000, + nick="Focal Track ID", + blurb="Pin the focal/highlighted player to this track id; -1 = auto " + "(the player tracked the most, with hysteresis so it stays stable)", + flags=GObject.ParamFlags.READWRITE, + ) + + def __init__(self): + super().__init__() + self.logger = LoggerFactory.get(LoggerFactory.LOGGER_TYPE_GST) + self.set_in_place(True) + self.width = 0 + self.height = 0 + self._order = _FORMAT_ORDER["RGBA"] + # per-track state, accumulated across frames + self._trail = {} + self._last_pt = {} + self._distance_px = {} + self._heights = [] + self._widths = [] + self._ell_w = {} # track_id -> smoothed ellipse half-width (px) + self._contacts = {} + self._last_contact_frame = {} + self._frames_seen = {} + self._track_label = {} + self._class_votes = {} # track_id -> {label: count}, for stable class + self._team_votes = {} # track_id -> {"red": n, "blue": n}, jersey team + self._frame = 0 + self._focal = None # current focal track id (sticky, for hysteresis) + self._headshot = None + self._headshot_loaded = False + self._inv_order = [0, 1, 2, 3] # buffer-channel -> logical RGBA index + # Position-smoothing slots: {"box": np[x1,y1,x2,y2]} kept across frames + # and matched by proximity, so the drawn ellipse can be low-passed. 
+ self._smooth_slots = [] + + def do_set_caps(self, incaps, outcaps): + info = GstVideo.VideoInfo.new_from_caps(incaps) + self.width = info.width + self.height = info.height + fmt = info.finfo.name if info.finfo else "RGBA" + self._order = _FORMAT_ORDER.get(fmt, _FORMAT_ORDER["RGBA"]) + # buffer channel j holds logical[self._order[j]]; invert so we can pull + # logical R,G,B out of the buffer for jersey colour classification. + self._inv_order = [self._order.index(c) for c in range(4)] + self._headshot_loaded = False # re-load in the new channel order + self.logger.info(f"FootballOverlay caps: {fmt} {self.width}x{self.height}") + return True + + def _map_label(self, label): + if self.class_names: + m = re.match(r"label_(\d+)$", label) + if m: + names = [s.strip() for s in self.class_names.split(",") if s.strip()] + i = int(m.group(1)) + if 0 <= i < len(names): + return names[i] + return label + + def _parse_label(self, full_label): + core = full_label + m = re.match(r"stream_\d+_(.*)$", full_label) + if m: + core = m.group(1) + m = re.match(r"(.+)_id_(\d+)$", core) + if m: + return self._map_label(m.group(1)), int(m.group(2)) + m = re.match(r"id_(\d+)$", core) + if m: + return "object", int(m.group(1)) + return self._map_label(core or "object"), None + + def _read_metadata(self, buf): + entries = [] + meta = GstAnalytics.buffer_get_analytics_relation_meta(buf) + if not meta: + return entries + for index in range(GstAnalytics.relation_get_length(meta)): + ret, od_mtd = meta.get_od_mtd(index) + if not ret or od_mtd is None: + continue + full_label = GLib.quark_to_string(od_mtd.get_obj_type()) + presence, x, y, w, h, score = od_mtd.get_location() + if not presence: + continue + label, track_id = self._parse_label(full_label) + entries.append( + { + "label": label.lower(), + "track_id": track_id, + "confidence": score, + "box": (x, y, x + w, y + h), + } + ) + return entries + + @staticmethod + def _point_to_bbox_distance(px, py, box): + x1, y1, x2, y2 = box + dx = 
max(x1 - px, 0.0, px - x2) + dy = max(y1 - py, 0.0, py - y2) + return (dx * dx + dy * dy) ** 0.5 + + def _ball_contact(self, players, ball_box): + """Closest player to the ball, if within contact_pad_ratio of its size.""" + bx = (ball_box[0] + ball_box[2]) / 2.0 + by = (ball_box[1] + ball_box[3]) / 2.0 + best_tid, best_d, best_box = None, float("inf"), None + for tid, box in players.items(): + d = self._point_to_bbox_distance(bx, by, box) + if d < best_d: + best_tid, best_d, best_box = tid, d, box + if best_box is None: + return None + w = best_box[2] - best_box[0] + h = best_box[3] - best_box[1] + if best_d > self.contact_pad_ratio * max(w, h): + return None + return best_tid + + def _update_tracks(self, entries, det_ball_box=None): + self._frame += 1 + active = set() + players = {} + ball_box = None + # Accumulate per-track class votes first so the stable label below + # already reflects this frame. + for e in entries: + tid = e["track_id"] + if tid is None: + continue + v = self._class_votes.setdefault(tid, {}) + v[e["label"]] = v.get(e["label"], 0) + 1 + for e in entries: + tid = e["track_id"] + if tid is None: + continue + label = self._stable_label(tid, e["label"]) + if _is_ball(label): + ball_box = e["box"] + continue + active.add(tid) + players[tid] = e["box"] + self._track_label[tid] = label + self._frames_seen[tid] = self._frames_seen.get(tid, 0) + 1 + x1, y1, x2, y2 = e["box"] + foot = (int((x1 + x2) / 2), int(y2)) + if y2 - y1 > 0: + self._heights.append(y2 - y1) + if len(self._heights) > 600: + self._heights = self._heights[-600:] + self._update_ellipse_width(tid, x2 - x1) + prev = self._last_pt.get(tid) + if prev is not None: + self._distance_px[tid] = ( + self._distance_px.get(tid, 0.0) + + ((foot[0] - prev[0]) ** 2 + (foot[1] - prev[1]) ** 2) ** 0.5 + ) + self._last_pt[tid] = foot + trail = self._trail.setdefault(tid, []) + trail.append(foot) + if len(trail) > self.trail_length: + del trail[: -self.trail_length] + + # Fall back to the detected ball 
if no tracked ball this frame. + if ball_box is None: + ball_box = det_ball_box + + # Ball contacts (debounced per player), like football_analyzer. + if ball_box is not None and players: + tid = self._ball_contact(players, ball_box) + if tid is not None: + last = self._last_contact_frame.get(tid) + if last is None or (self._frame - last) > self.contact_gap_frames: + self._contacts[tid] = self._contacts.get(tid, 0) + 1 + self._last_contact_frame[tid] = self._frame + + for tid in list(self._trail.keys()): + if tid not in active: + del self._trail[tid] + self._last_pt.pop(tid, None) + self._ell_w.pop(tid, None) + return active + + def _update_ellipse_width(self, track_id, raw_w): + # Smooth (and outlier-reject) the per-track ellipse width so a single + # oversized box -- two players merged, or a drifting keep-alive + # prediction -- can't balloon the circle for one frame. + if raw_w <= 0: + return + if self._widths: + self._widths.append(raw_w) + if len(self._widths) > 600: + self._widths = self._widths[-600:] + srt = sorted(self._widths) + med = srt[len(srt) // 2] + clamped = min(max(raw_w, 0.5 * med), 1.8 * med) + else: + self._widths.append(raw_w) + clamped = raw_w + prev = self._ell_w.get(track_id) + # EMA: slow enough to keep the circle size steady frame-to-frame, fast + # enough to still follow real perspective changes as players move. + self._ell_w[track_id] = ( + clamped if prev is None else 0.25 * clamped + 0.75 * prev + ) + + def _px_per_meter(self): + if not self._heights: + return None + import numpy as np + + return float(np.median(self._heights)) / max(0.1, self.player_height) + + def _focal_track(self): + # Pin to an explicit track id if requested. + if self.focal_track_id >= 0: + return ( + self.focal_track_id + if self.focal_track_id in self._frames_seen + else self._focal + ) + keys = set(self._frames_seen) + if not keys: + return None + + # Only consider *sustained* tracks. 
Otherwise a track that flickered for + # a few frames -- common when detection/tracking churns -- can win on a + # single ball contact and then show ~0 distance (it was barely tracked). + # The floor scales with elapsed frames, with a small absolute minimum. + floor = max(10, int(0.2 * self._frame)) + candidates = [t for t in keys if self._frames_seen.get(t, 0) >= floor] or list( + keys + ) + + # Rank by ball contacts (the player most involved with the ball), with + # frames-seen as a tiebreak / pre-contact fallback (before anyone has + # touched the ball, the most-tracked player is shown). + def score(t): + return (self._contacts.get(t, 0), self._frames_seen.get(t, 0)) + + best = max(candidates, key=score) + # Stability: keep the current focal unless a challenger has *strictly + # more* contacts, so the highlight/HUD don't flip on ties or noise. + cur = self._focal + if ( + cur is not None + and cur in candidates + and self._contacts.get(best, 0) <= self._contacts.get(cur, 0) + ): + best = cur + self._focal = best + return best + + def _stable_label(self, track_id, fallback=""): + # Majority-voted class over the track's history — smooths frame-to-frame + # misclassifications (e.g. a player briefly tagged 'referee'), so the + # gold referee marking doesn't flicker. + votes = self._class_votes.get(track_id) + if not votes: + return fallback + return max(votes, key=votes.get) + + def _c(self, rgba): + return tuple(rgba[i] for i in self._order) + + def _team_color(self, track_id): + # Confident kit colour from a track's accumulated jersey votes, else + # None. Red/blue -> team; "ref" (distinctive non-team kit) -> gold. + # Requires a minimum number of votes AND a clear majority, so a few + # noisy frames can't decide the colour. 
def _is_referee_track(self, track_id, fallback_label):
    """Decide whether a track should be treated as a referee.

    With no recorded class votes (or no track id) the decision falls back to
    ``_is_referee(fallback_label)``. Otherwise the referee-labelled votes
    must number at least 3 and make up at least 60% of all votes, so a track
    that is mostly labelled as a player stays a player.
    """
    votes = None
    if track_id is not None:
        votes = self._class_votes.get(track_id)
    if not votes:
        return _is_referee(fallback_label)

    total = sum(votes.values())
    referee_votes = 0
    for name, count in votes.items():
        if _is_referee(name):
            referee_votes += count

    if not total > 0:
        return False
    if not referee_votes >= 3:
        return False
    return referee_votes >= 0.6 * total
+ ax1, ay1, ax2, ay2 = a + bx1, by1, bx2, by2 = b + iw = max(0.0, min(ax2, bx2) - max(ax1, bx1)) + ih = max(0.0, min(ay2, by2) - max(ay1, by1)) + inter = iw * ih + if inter <= 0.0: + return 0.0 + area_a = max(0.0, ax2 - ax1) * max(0.0, ay2 - ay1) + area_b = max(0.0, bx2 - bx1) * max(0.0, by2 - by1) + union = area_a + area_b - inter + iou = inter / union if union > 0.0 else 0.0 + smaller = min(area_a, area_b) + contain = inter / smaller if smaller > 0.0 else 0.0 + return max(iou, contain) + + @staticmethod + def _feet_close(a, b): + # True when two boxes' foot points (bottom-centre, where the ellipse is + # drawn) are within ~0.4 of the smaller box width. The ellipse is ~2x the + # box width, so near-coincident feet = one player circled twice even when + # the boxes' IoU is low. Genuinely adjacent players are ~a full width + # apart at the feet, so they're not merged. + fax, fay = (a[0] + a[2]) / 2.0, a[3] + fbx, fby = (b[0] + b[2]) / 2.0, b[3] + ref = max(1.0, min(a[2] - a[0], b[2] - b[0])) + return ((fax - fbx) ** 2 + (fay - fby) ** 2) ** 0.5 < 0.4 * ref + + def _merge_overlaps(self, entries): + # Class-agnostic greedy suppression: keep the most confident box, drop + # any later box that overlaps it past merge_iou OR sits at the same feet. + # Collapses a player circled twice (e.g. player+goalkeeper on one person, + # or two offset boxes) into one. The ball is never merged against players. 
def _assign_track_ids(self, draw_entries, track_entries):
    """Return one track id (or None) per drawn entry, in draw order.

    Entries that already carry a ``track_id`` keep it. Non-ball entries
    without one borrow the id of the best-overlapping non-ball track
    (``_overlap`` >= 0.3), assigned greedily from the highest overlap down,
    with each track id used at most once.

    Ids already carried by a drawn entry are reserved up front, so a
    detection box can never borrow an id that another drawn box is already
    showing (which would put the same badge and colour on two boxes).

    Args:
        draw_entries: entries about to be drawn; each has "track_id",
            "label" and "box".
        track_entries: tracker entries for this frame, same keys.
    """
    ids = [e["track_id"] for e in draw_entries]
    if not track_entries:
        return ids

    pairs = []
    for di, e in enumerate(draw_entries):
        if e["track_id"] is not None or _is_ball(e["label"]):
            continue
        for t in track_entries:
            if _is_ball(t["label"]):
                continue
            ov = self._overlap(e["box"], t["box"])
            if ov >= 0.3:
                pairs.append((ov, di, t["track_id"]))
    # Stable sort: equal overlaps keep their discovery order.
    pairs.sort(key=lambda p: p[0], reverse=True)

    used_draw = set()
    # Reserve every id that is already on a drawn entry.
    used_track = {tid for tid in ids if tid is not None}
    for _ov, di, tid in pairs:
        if di in used_draw or tid in used_track:
            continue
        ids[di] = tid
        used_draw.add(di)
        used_track.add(tid)
    return ids
def _detection_color(self, cv2, np, frame, label, box):
    """Pick the draw colour for a detection box that has no track id.

    With ``team_colors`` off, referees get the referee colour and everything
    else the generic player colour. With it on, the jersey is classified
    from this frame via ``_classify_jersey`` and mapped to the red, blue or
    referee colour. When the jersey vote is inconclusive, a referee-labelled
    box still gets the referee colour and any other box returns None (the
    caller then skips drawing it).
    """
    labelled_referee = _is_referee(label)
    if not self.team_colors:
        if labelled_referee:
            return _REFEREE_RGBA
        return _PLAYER_RGBA

    by_vote = {
        "red": _RED_TEAM_RGBA,
        "blue": _BLUE_TEAM_RGBA,
        "ref": _REFEREE_RGBA,
    }
    vote = self._classify_jersey(cv2, np, frame, box)
    if vote in by_vote:
        return by_vote[vote]
    if labelled_referee:
        return _REFEREE_RGBA
    return None
+ ref = int(((((h >= 18) & (h <= 34)) | ((h >= 145) & (h <= 165))) & s_v).sum()) + min_pixels = max(20, int(0.02 * rgb.shape[0] * rgb.shape[1])) + counts = {"red": red, "blue": blue, "ref": ref} + best = max(counts, key=counts.get) + if counts[best] < min_pixels: + return None + return best + + def _load_headshot(self, cv2, np): + if self._headshot_loaded: + return self._headshot + self._headshot_loaded = True + self._headshot = None + path = self.headshot_path + if not path or not os.path.exists(path): + if path: + self.logger.warning(f"headshot not found: {path}") + return None + img = cv2.imread(path) # BGR + if img is None: + return None + sz = int(self.headshot_size) + img = cv2.resize(img, (sz, sz), interpolation=cv2.INTER_AREA) + rgb = img[:, :, ::-1] # BGR -> RGB + alpha = np.full((sz, sz, 1), 255, dtype=np.uint8) + rgba = np.concatenate([rgb, alpha], axis=2).astype(np.uint8) # logical RGBA + + self._headshot = np.ascontiguousarray(rgba[:, :, list(self._order)]) + return self._headshot + + def _draw_trail(self, cv2, np, frame, points, rgba): + if len(points) < 2: + return + pts = np.array(points, dtype=np.int32).reshape(-1, 1, 2) + cv2.polylines(frame, [pts], False, self._c(rgba), 2, cv2.LINE_AA) + + def _draw_ellipse(self, cv2, frame, box, rgba, track_id): + x1, y1, x2, y2 = box + y_bottom = int(y2) + x_center = int((x1 + x2) / 2) + # Prefer the per-track smoothed width so the ellipse stays stable even + # when a single detection box is momentarily oversized. 
def _draw_triangle(self, cv2, np, frame, box, rgba):
    """Draw a filled, black-outlined triangle pointing down at a box's top.

    The apex sits at the top-centre of ``box`` (x1, y1, x2, y2); the other
    two corners are 20 px above it and 10 px to either side.
    """
    left, top, right, _bottom = box
    apex_x = int((left + right) / 2)
    apex_y = int(top)
    base_y = apex_y - 20
    corners = np.array(
        [[apex_x, apex_y], [apex_x - 10, base_y], [apex_x + 10, base_y]],
        dtype=np.int32,
    )
    contours = [corners]
    # Fill first, then stroke the outline on top.
    cv2.drawContours(frame, contours, 0, self._c(rgba), cv2.FILLED)
    cv2.drawContours(frame, contours, 0, self._c(_BLACK_RGBA), 2)
+ x_center = int((x1 + x2) / 2) + width = max(1, int(x2 - x1)) + cv2.ellipse( + frame, + (x_center, int(y2)), + (width, max(1, int(0.35 * width))), + 0.0, + -45, + 235, + self._c(_HIGHLIGHT_RGBA), + 4, + cv2.LINE_AA, + ) + + def _draw_label(self, cv2, frame, box, label, rgba): + x1, y1, _, _ = box + cv2.putText( + frame, + label, + (int(x1), max(12, int(y1) - 6)), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + self._c(rgba), + 1, + cv2.LINE_AA, + ) + + def _draw_hud(self, cv2, frame, contacts, distance_m, rgba, headshot): + font = cv2.FONT_HERSHEY_SIMPLEX + x, y = 10, 10 + if headshot is not None: + hh, hw = headshot.shape[:2] + w, h = hw + 280, max(110, hh + 20) + text_x = x + hw + 20 + else: + w, h = 320, 100 + text_x = x + 12 + cv2.rectangle(frame, (x, y), (x + w, y + h), self._c(_HUD_BG_RGBA), cv2.FILLED) + cv2.rectangle(frame, (x, y), (x + w, y + h), self._c(rgba), 2) + if headshot is not None: + hy, hx = y + 10, x + 10 + fh, fw = frame.shape[:2] + hh = min(hh, fh - hy) + hw = min(hw, fw - hx) + if hh > 0 and hw > 0: + frame[hy : hy + hh, hx : hx + hw] = headshot[:hh, :hw] + cv2.rectangle(frame, (hx, hy), (hx + hw, hy + hh), self._c(rgba), 2) + tc = self._c(_HUD_TEXT_RGBA) + cv2.putText( + frame, self.player_label, (text_x, y + 28), font, 0.7, tc, 2, cv2.LINE_AA + ) + cv2.putText( + frame, + f"Ball contacts: {contacts}", + (text_x, y + 58), + font, + 0.6, + tc, + 1, + cv2.LINE_AA, + ) + cv2.putText( + frame, + f"Distance: {distance_m:.1f} m", + (text_x, y + 85), + font, + 0.6, + tc, + 1, + cv2.LINE_AA, + ) + + def do_transform_ip(self, buf): + try: + import numpy as np + + all_entries = self._read_metadata(buf) + # The buffer carries both the detector's boxes (track_id None) and + # the tracker's boxes (track_id set). Tracking state/HUD always use + # the tracked entries; what we *draw* depends on draw_from_detections. 
+ track_entries = [e for e in all_entries if e["track_id"] is not None] + det_entries = [e for e in all_entries if e["track_id"] is None] + + # Ball position for contact counting: prefer a tracked ball, else + # fall back to the strongest ball *detection* (the ball is small and + # fast, so it often isn't tracked) -- so contacts still get counted. + det_ball_box = None + best_ball = -1.0 + for e in det_entries: + if _is_ball(e["label"]) and e["confidence"] > best_ball: + best_ball, det_ball_box = e["confidence"], e["box"] + + # Per-track state (votes, contacts, distance, focal) from the tracker. + active = self._update_tracks( + track_entries if track_entries else all_entries, det_ball_box + ) + + if self.draw_from_detections: + draw_entries = list(det_entries) + # Bridge missed detections: the detector occasionally drops a + # player for a frame, which would flicker the circle. The tracker + # is still coasting that player (Kalman keep-alive), so draw any + # confirmed track that has no detection this frame -- detections + # still drive everything they cover; tracks only fill the gaps. + if track_entries: + covered = set() + for d in det_entries: + if _is_ball(d["label"]): + continue + for t in track_entries: + if t["track_id"] in covered or _is_ball(t["label"]): + continue + if self._overlap(d["box"], t["box"]) >= 0.3: + covered.add(t["track_id"]) + draw_entries += [ + t + for t in track_entries + if not _is_ball(t["label"]) and t["track_id"] not in covered + ] + else: + draw_entries = track_entries if track_entries else det_entries + # min-confidence gates only what we *draw* (tracks carry conf 1.0, so + # they're unaffected); the contact math above used the raw detections. + if self.min_confidence > 0.0: + draw_entries = [ + e for e in draw_entries if e["confidence"] >= self.min_confidence + ] + # Collapse overlapping boxes so one player isn't circled twice, + # then low-pass the positions so the circle glides. 
+ draw_entries = self._merge_overlaps(draw_entries) + draw_entries = self._smooth_boxes(np, draw_entries) + if not all_entries: + return Gst.FlowReturn.OK + + import cv2 + + ok, mapinfo = buf.map(Gst.MapFlags.WRITE) + if not ok: + self.logger.error("Failed to map buffer for writing") + return Gst.FlowReturn.ERROR + try: + frame = np.frombuffer( + mapinfo.data, dtype=np.uint8, count=self.height * self.width * 4 + ).reshape(self.height, self.width, 4) + + # Jersey team voting first, so trails/ellipses use this frame's + # vote (track mode; detection mode classifies per box at draw). + # Referees are voted on too -- their colour comes from the jersey + # (gold only as the fallback), not the class label. + if self.team_colors: + for e in track_entries: + tid = e["track_id"] + lab = self._stable_label(tid, e["label"]) + if _is_ball(lab): + continue + vote = self._classify_jersey(cv2, np, frame, e["box"]) + if vote: + tv = self._team_votes.setdefault( + tid, {"red": 0, "blue": 0, "ref": 0} + ) + tv[vote] = tv.get(vote, 0) + 1 + + if self.trails: + for tid in active: + rgba = self._color_for(self._track_label.get(tid, ""), tid) + if rgba is None: + continue + self._draw_trail(cv2, np, frame, self._trail.get(tid, []), rgba) + + # Which drawn box is the focal (HUD) player? Match the focal + # track's box to the nearest drawn box so we can highlight it + # even when drawing from detections (no track id on the box). 
+ focal_idx = None + if self.highlight_focal: + focal_tid = self._focal_track() + focal_box = None + if focal_tid is not None: + for t in track_entries: + if t["track_id"] == focal_tid: + focal_box = t["box"] + break + if focal_box is not None: + best = 0.0 + for i, e in enumerate(draw_entries): + if _is_ball(e["label"]): + continue + ov = self._overlap(e["box"], focal_box) + if ov > best: + best, focal_idx = ov, i + + # Stable track id per drawn box (detection boxes borrow the id of + # the track they overlap) -- used for the id badge and to look up + # the track's accumulated colour. + draw_ids = self._assign_track_ids(draw_entries, track_entries) + + for i, e in enumerate(draw_entries): + box = e["box"] + badge_id = draw_ids[i] + # Use the track's stable identity (class + accumulated team + # votes) for colour whenever the box maps to a track -- in + # detection mode that's the box's matched track id. This + # makes colour robust to per-frame label/jersey noise. Only + # an unmatched detection falls back to this frame's guess. 
+ color_tid = e["track_id"] if e["track_id"] is not None else badge_id + if color_tid is not None: + label = self._stable_label(color_tid, e["label"]) + else: + label = e["label"] + if _is_ball(label): + if self.show_ball: + self._draw_triangle(cv2, np, frame, box, _BALL_RGBA) + continue + if color_tid is not None: + rgba = self._color_for(label, color_tid) + else: + rgba = self._detection_color(cv2, np, frame, label, box) + if rgba is None: + continue + self._draw_ellipse(cv2, frame, box, rgba, badge_id) + if i == focal_idx: + self._draw_focal_marker(cv2, np, frame, box) + if self.show_labels: + self._draw_label(cv2, frame, box, label, rgba) + + if self.show_hud: + focal = self._focal_track() + if focal is not None: + ppm = self._px_per_meter() + dist_m = ( + (self._distance_px.get(focal, 0.0) / ppm) if ppm else 0.0 + ) + hud_rgba = ( + self._color_for(self._track_label.get(focal, ""), focal) + or _DEFAULT_RGBA + ) + self._draw_hud( + cv2, + frame, + self._contacts.get(focal, 0), + dist_m, + hud_rgba, + self._load_headshot(cv2, np), + ) + finally: + buf.unmap(mapinfo) + + return Gst.FlowReturn.OK + + except Exception as e: + self.logger.error(f"FootballOverlay transform error: {e}") + return Gst.FlowReturn.ERROR + + +if CAN_REGISTER_ELEMENT: + GObject.type_register(FootballOverlay) + __gstelementfactory__ = ( + "pyml_football_overlay", + Gst.Rank.NONE, + FootballOverlay, + ) +else: + GlobalLogger().warning( + "The 'pyml_football_overlay' element will not be registered because " + "required modules are missing." 
def do_forward(self, frames):
    """Copy the decoder thresholds onto the engine, then run the base forward.

    When an engine is set, its ``conf`` and ``iou`` attributes are
    overwritten from the ``confidence`` and ``nms_iou`` properties on every
    call, so the current property values are in place before the parent
    class's ``do_forward`` runs. With no engine the call is passed straight
    through.
    """
    if not self.engine:
        return super().do_forward(frames)
    self.engine.conf = self.confidence
    self.engine.iou = self.nms_iou
    return super().do_forward(frames)
Weak/ghost boxes can still *continue* an + # existing track (matched above) but won't spawn phantom circles. + self.new_track_conf = new_track_conf + # Keep emitting a confirmed track (with its Kalman-predicted box) for up + # to keep_alive frames after a missed detection — bridges flicker so the + # overlay doesn't blink when the detector drops a box for a frame or two. + self.keep_alive = keep_alive + # Camera-motion compensation: estimate the global image shift from the + # tracks that matched, then re-try matching the leftovers with their + # predictions shifted by it. Re-attaches players during a pan instead of + # leaving the old track behind and spawning a duplicate. + self.camera_motion = camera_motion + # Two confirmed tracks overlapping more than this IoU are duplicates; + # the weaker one is dropped (ByteTrack's remove_duplicate_stracks). + self.dup_iou = dup_iou self.trackers = [] + @staticmethod + def _center(bbox): + return (bbox[0] + bbox[2] / 2.0, bbox[1] + bbox[3] / 2.0) + + def _estimate_motion(self, matches, predicted, det_bboxes): + """Fit a global 2D similarity transform (translation + uniform scale + + rotation) mapping each matched track's predicted centre to its observed + centre. Returns a callable box->warped-box, or None if it can't be + estimated. 
Uses RANSAC (via OpenCV) so players moving against the camera + consensus are rejected as outliers; falls back to a robust median + translation if OpenCV is unavailable or the fit is degenerate.""" + import numpy as np + + if len(matches) < 3: + return None + src = np.array( + [self._center(predicted[ti]) for _, ti in matches], dtype=np.float32 + ) + dst = np.array( + [self._center(det_bboxes[di]) for di, _ in matches], dtype=np.float32 + ) + + M = None + try: + import cv2 + + M, _ = cv2.estimateAffinePartial2D( + src, dst, method=cv2.RANSAC, ransacReprojThreshold=5.0 + ) + except Exception: + M = None + + if M is not None: + scale = float(np.hypot(M[0, 0], M[0, 1])) + # Reject implausible fits (e.g. from too few/noisy correspondences). + if 0.5 <= scale <= 2.0: + + def warp(box): + cx, cy = self._center(box) + ncx = M[0, 0] * cx + M[0, 1] * cy + M[0, 2] + ncy = M[1, 0] * cx + M[1, 1] * cy + M[1, 2] + nw, nh = box[2] * scale, box[3] * scale + return np.array([ncx - nw / 2.0, ncy - nh / 2.0, nw, nh]) + + return warp + + # Fallback: robust median translation (pan/tilt only). + delta = dst - src + tx, ty = float(np.median(delta[:, 0])), float(np.median(delta[:, 1])) + if abs(tx) < 1.0 and abs(ty) < 1.0: + return None + return lambda box: np.array([box[0] + tx, box[1] + ty, box[2], box[3]]) + + def _associate(self, det_bboxes, det_idxs, trk_idxs, trk_boxes, detections): + """Hungarian-match a subset of detections to a subset of trackers, + applying updates to matched trackers. 
def _suppress_duplicates(self):
    """Drop the weaker of any two tracks whose boxes overlap above ``dup_iou``.

    Every pair of trackers in ``self.trackers`` is compared on the IoU of
    their current ``get_bbox()`` boxes. For a pair above the threshold the
    track with the smaller ``(time_since_update, -hits)`` key is kept, i.e.
    the one matched more recently, then the one with more hits; on a full
    tie the earlier track in the list wins.

    Once a track has been marked for removal it stops suppressing others:
    IoU is not transitive, so letting a dropped track keep knocking out
    later tracks could delete a track that does not actually duplicate any
    survivor.
    """
    n = len(self.trackers)
    if n < 2:
        return
    boxes = [t.get_bbox() for t in self.trackers]
    iou_matrix = iou_batch(boxes, boxes)
    remove = set()
    for i in range(n):
        if i in remove:
            continue
        ti = self.trackers[i]
        ki = (ti.time_since_update, -ti.hits)
        for j in range(i + 1, n):
            if j in remove or not iou_matrix[i, j] > self.dup_iou:
                continue
            tj = self.trackers[j]
            kj = (tj.time_since_update, -tj.hits)
            if ki <= kj:
                remove.add(j)
            else:
                # Track i lost; it must not go on to suppress anyone else.
                remove.add(i)
                break
    if remove:
        self.trackers = [t for k, t in enumerate(self.trackers) if k not in remove]
@@ -147,54 +272,66 @@ def update(self, detections): list of (track_id, bbox, label_quark) for confirmed tracks """ import numpy as np - from scipy.optimize import linear_sum_assignment # Predict new locations for existing tracks - predicted = [] to_remove = [] for i, trk in enumerate(self.trackers): - pred = trk.predict() - if np.any(np.isnan(pred)): + if np.any(np.isnan(trk.predict())): to_remove.append(i) - else: - predicted.append(pred) for i in reversed(to_remove): self.trackers.pop(i) - # Build cost matrix using IoU det_bboxes = [d[:4] for d in detections] if len(detections) > 0 else [] - iou_matrix = iou_batch(det_bboxes, predicted) - cost_matrix = 1.0 - iou_matrix - - # Hungarian assignment - matched_det = set() - matched_trk = set() - if cost_matrix.size > 0: - row_ind, col_ind = linear_sum_assignment(cost_matrix) - for r, c in zip(row_ind, col_ind): - if iou_matrix[r, c] >= self.iou_threshold: - matched_det.add(r) - matched_trk.add(c) - self.trackers[c].update(detections[r][:4]) - # Store latest label quark on tracker - self.trackers[c].label_quark = detections[r][5] - - # Create new tracks for unmatched detections - for d_idx in range(len(detections)): + n_det = len(det_bboxes) + n_trk = len(self.trackers) + # Predicted box per tracker, captured before any update this frame. + predicted = [self.trackers[i].get_bbox() for i in range(n_trk)] + + # 1) First association on the raw predictions. + matches = self._associate( + det_bboxes, list(range(n_det)), list(range(n_trk)), predicted, detections + ) + matched_det = {di for di, _ in matches} + matched_trk = {ti for _, ti in matches} + + # 2) Camera-motion compensation: fit a global image transform (pan, zoom + # and rotation) from the tracks that matched, apply it to the leftover + # predictions, and re-match. This recovers tracks during camera moves + # instead of leaving them behind and spawning duplicates. 
+ if self.camera_motion: + warp = self._estimate_motion(matches, predicted, det_bboxes) + if warp is not None: + rem_trk = [i for i in range(n_trk) if i not in matched_trk] + rem_det = [d for d in range(n_det) if d not in matched_det] + if rem_trk and rem_det: + shifted = {i: warp(predicted[i]) for i in rem_trk} + m2 = self._associate( + det_bboxes, rem_det, rem_trk, shifted, detections + ) + matched_det.update(di for di, _ in m2) + + # Create new tracks for unmatched detections, but only from confident + # ones (ByteTrack activation gate) so weak/ghost boxes don't start a + # phantom track that gets drawn as a stray circle. + for d_idx in range(n_det): if d_idx not in matched_det: + if detections[d_idx][4] < self.new_track_conf: + continue trk = KalmanBoxTracker(detections[d_idx][:4]) trk.label_quark = detections[d_idx][5] self.trackers.append(trk) - # Remove dead tracks + # Remove dead tracks, then drop duplicate tracks sitting on one object. self.trackers = [ t for t in self.trackers if t.time_since_update <= self.max_age ] + self._suppress_duplicates() - # Return confirmed tracks + # Return confirmed tracks, including ones that missed a detection this + # frame (predicted box) for up to keep_alive frames — prevents flicker. 
results = [] for trk in self.trackers: - if trk.hits >= self.min_hits and trk.time_since_update == 0: + if trk.hits >= self.min_hits and trk.time_since_update <= self.keep_alive: results.append((trk.id, trk.get_bbox(), trk.label_quark)) return results @@ -268,6 +405,50 @@ class TrackerTransform(GstBase.BaseTransform): flags=GObject.ParamFlags.READWRITE, ) + keep_alive = GObject.Property( + type=int, + default=2, + minimum=0, + maximum=1000, + nick="Keep Alive", + blurb="Frames to keep emitting a confirmed track (Kalman-predicted box) " + "after a missed detection; bridges flicker (0 = only matched frames)", + flags=GObject.ParamFlags.READWRITE, + ) + + new_track_confidence = GObject.Property( + type=float, + default=0.25, + minimum=0.0, + maximum=1.0, + nick="New Track Confidence", + blurb="Minimum detection confidence to START a new track (ByteTrack " + "activation gate); weak boxes still continue existing tracks but " + "won't spawn phantom/duplicate circles", + flags=GObject.ParamFlags.READWRITE, + ) + + camera_motion = GObject.Property( + type=bool, + default=True, + nick="Camera Motion Compensation", + blurb="Estimate the global image shift from matched tracks and re-match " + "leftovers shifted by it, so a panning camera re-attaches players " + "instead of leaving the old track behind and spawning a duplicate", + flags=GObject.ParamFlags.READWRITE, + ) + + duplicate_iou = GObject.Property( + type=float, + default=0.8, + minimum=0.0, + maximum=1.0, + nick="Duplicate IoU", + blurb="Two confirmed tracks overlapping more than this are treated as " + "duplicates and the weaker one is dropped", + flags=GObject.ParamFlags.READWRITE, + ) + def __init__(self): super().__init__() self.logger = LoggerFactory.get(LoggerFactory.LOGGER_TYPE_GST) @@ -281,6 +462,10 @@ def _ensure_tracker(self): max_age=self.max_age, min_hits=self.min_hits, iou_threshold=self.iou_threshold, + keep_alive=self.keep_alive, + new_track_conf=self.new_track_confidence, + 
camera_motion=self.camera_motion, + dup_iou=self.duplicate_iou, ) return self._tracker @@ -351,6 +536,14 @@ def do_get_property(self, prop): return self.min_hits elif prop.name == "iou-threshold": return self.iou_threshold + elif prop.name == "keep-alive": + return self.keep_alive + elif prop.name == "new-track-confidence": + return self.new_track_confidence + elif prop.name == "camera-motion": + return self.camera_motion + elif prop.name == "duplicate-iou": + return self.duplicate_iou else: raise AttributeError(f"Unknown property {prop.name}") @@ -367,6 +560,18 @@ def do_set_property(self, prop, value): elif prop.name == "iou-threshold": self.iou_threshold = value self._tracker = None + elif prop.name == "keep-alive": + self.keep_alive = value + self._tracker = None + elif prop.name == "new-track-confidence": + self.new_track_confidence = value + self._tracker = None + elif prop.name == "camera-motion": + self.camera_motion = value + self._tracker = None + elif prop.name == "duplicate-iou": + self.duplicate_iou = value + self._tracker = None else: raise AttributeError(f"Unknown property {prop.name}") diff --git a/plugins/python/yolo.py b/plugins/python/yolo.py index a34deb6..2140c74 100644 --- a/plugins/python/yolo.py +++ b/plugins/python/yolo.py @@ -160,6 +160,9 @@ def do_forward(self, frames): ) end_pre = time.time() + conf = getattr(self, "conf", 0.25) + iou = getattr(self, "iou", 0.5) + agnostic = getattr(self, "agnostic_nms", True) if self.track: # Ensure tracker persists across batches results = self.execute_with_stream( @@ -167,14 +170,23 @@ def do_forward(self, frames): source=img_list, persist=True, imgsz=640, - conf=0.1, + conf=conf, + iou=iou, + agnostic_nms=agnostic, verbose=True, tracker="botsort.yaml", ) ) else: results = self.execute_with_stream( - lambda: model(img_list, imgsz=640, conf=0.1, verbose=True) + lambda: model( + img_list, + imgsz=640, + conf=conf, + iou=iou, + agnostic_nms=agnostic, + verbose=True, + ) ) end_inf = time.time() @@ -205,6 
+217,36 @@ class YOLOTransform(BaseObjectDetector): "Aaron Boxer ", ) + confidence = GObject.Property( + type=float, + default=0.1, + minimum=0.0, + maximum=1.0, + nick="Confidence Threshold", + blurb="Minimum detection confidence (matches football_analyzer); kept " + "low on purpose so the tracker can use weak boxes to continue tracks " + "-- the tracker's new-track-confidence gates phantom tracks", + flags=GObject.ParamFlags.READWRITE, + ) + nms_iou = GObject.Property( + type=float, + default=0.7, + minimum=0.0, + maximum=1.0, + nick="NMS IoU", + blurb="NMS IoU threshold (matches football_analyzer's default); lower " + "suppresses more overlap but can also drop genuinely close players", + flags=GObject.ParamFlags.READWRITE, + ) + agnostic_nms = GObject.Property( + type=bool, + default=False, + nick="Class-Agnostic NMS", + blurb="Suppress overlapping boxes across classes too; off by default " + "(like football_analyzer) so two close players aren't merged", + flags=GObject.ParamFlags.READWRITE, + ) + def __init__(self): super().__init__() self.mgr.engine_name = "pyml_yolo_engine" @@ -222,6 +264,14 @@ def engine_name(self, value): "The 'engine_name' property cannot be set in this derived class." ) + def do_forward(self, frames): + # Push NMS/confidence knobs to the engine before it runs the model. + if self.engine: + self.engine.conf = self.confidence + self.engine.iou = self.nms_iou + self.engine.agnostic_nms = self.agnostic_nms + return super().do_forward(frames) + def do_decode(self, buf, result, stream_idx=0): self.logger.debug( f"Decoding YOLO result for buffer {hex(id(buf))}, stream {stream_idx}: {result}" @@ -250,7 +300,9 @@ def do_decode(self, buf, result, stream_idx=0): score = boxes.conf[i] label = boxes.cls[i] label_num = label.item() - class_name = COCO_CLASSES.get(label_num, f"unknown_{label_num}") + # Prefer the model's own class names; fall back to COCO for plain yolo. 
+ names = getattr(result, "names", None) or COCO_CLASSES + class_name = names.get(label_num, f"unknown_{label_num}") # Use class name for detection, track_id for tracking if self.engine.track and hasattr(boxes, "id") and boxes.id is not None: diff --git a/rzv2h/CMakeLists.txt b/rzv2h/CMakeLists.txt new file mode 100644 index 0000000..5f09e2f --- /dev/null +++ b/rzv2h/CMakeLists.txt @@ -0,0 +1,54 @@ +# Build the `drpai_runtime` Python extension for RZ/V2H. +# +# This mirrors the SDK's apps/CMakeLists.txt (same TVM includes, same V2H +# runtime libraries) but produces a Python module instead of an executable. +# It MUST be configured with the SDK cross-toolchain and built inside the +# RZ/V2H DRP-AI TVM SDK Docker. See README.md. +# +# Required env: TVM_ROOT (root of rzv_drp-ai_tvm), SDK (Yocto cross SDK) +# Required -D : PYBIND11_INCLUDE_DIR, PYTHON_INCLUDE_DIR (target aarch64 python) +cmake_minimum_required(VERSION 3.16) +project(drpai_runtime CXX) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +if(NOT DEFINED ENV{TVM_ROOT}) + message(FATAL_ERROR "TVM_ROOT not set — source the DRP-AI TVM SDK env first") +endif() +set(TVM_ROOT "$ENV{TVM_ROOT}") + +set(DRPAI_APPS "${TVM_ROOT}/apps" CACHE PATH "rzv_drp-ai_tvm/apps directory") +set(PYBIND11_INCLUDE_DIR "" CACHE PATH "pybind11 include directory") +set(PYTHON_INCLUDE_DIR "" CACHE PATH "target python3 include dir") +set(LIBMERA_RT_PATH ${TVM_ROOT}/obj/build_runtime/v2h/lib) + +add_library(drpai_runtime MODULE + drpai_runtime_pybind.cpp + ${DRPAI_APPS}/MeraDrpRuntimeWrapper.cpp +) +set_target_properties(drpai_runtime PROPERTIES PREFIX "" SUFFIX ".so") + +target_include_directories(drpai_runtime PRIVATE + ${DRPAI_APPS} + ${TVM_ROOT}/tvm/include + ${TVM_ROOT}/setup/include + ${TVM_ROOT}/tvm/3rdparty/dlpack/include + ${TVM_ROOT}/tvm/3rdparty/dmlc-core/include + ${TVM_ROOT}/tvm/3rdparty/compiler-rt + ${PYBIND11_INCLUDE_DIR} + ${PYTHON_INCLUDE_DIR} +) + +add_definitions(-DMERA_DRP_RUNTIME) 
+target_compile_definitions(drpai_runtime PUBLIC KDLDRPAI) +target_link_directories(drpai_runtime PRIVATE ${LIBMERA_RT_PATH}) +target_link_libraries(drpai_runtime PRIVATE + mera2_runtime + mera2_plan_io + drp_tvm_rt + pthread +) +set_target_properties(drpai_runtime PROPERTIES + LINK_FLAGS "-Wl,-rpath,${LIBMERA_RT_PATH} -Wl,-rpath-link,${LIBMERA_RT_PATH}") + +target_compile_options(drpai_runtime PRIVATE -O3 -mtune=cortex-a55 -Wall -fvisibility=hidden) diff --git a/rzv2h/README.md b/rzv2h/README.md new file mode 100644 index 0000000..de8d0bc --- /dev/null +++ b/rzv2h/README.md @@ -0,0 +1,80 @@ +# Object detection on Renesas RZ/V2H (DRP-AI NPU) + +This runs `pyml_objectdetector` on the **RZ/V2H** DRP-AI NPU, using a YOLO11 +model compiled with the **DRP-AI TVM** compiler (powered by EdgeCortix MERA). + +It is the decomposed, metadata-passing pipeline used elsewhere in this repo — +detector -> (tracker) -> overlay, but the detector's inference runs on the NPU: + +``` +... ! pyml_objectdetector engine-name=drpai model-name=DEPLOY_DIR device=drpai + input-format=nchw post-process=anchor_free + ! pyml_tracker ! pyml_overlay ! ... +``` +## Prerequisites + +- RZ/V2H EVK with the **RZ/V2H AI SDK v6.00** Yocto image (provides the DRP-AI + driver, `/dev/drpai0`, GStreamer, and Python 3). +- The **DRP-AI TVM** package (`rzv_drp-ai_tvm`) and its SDK Docker, with the + environment sourced so `TVM_ROOT`, `SDK` (cross SDK), and the DRP-AI + translator are set. (`PRODUCT=V2H`.) +- `pybind11` headers available to the cross build. + +## 1 — Convert the model (in the SDK Docker) + +```bash +./convert_yolo11_v2h.sh /work/models/yolo11m/yolo11m.onnx /work/rzv2h/yolo11m_drpai_v2h /work/rzv2h/calib 640 +``` + +This quantizes (INT8, using the calibration images) an already-exported YOLO11 ONNX +(input node `images`, `1x3x640x640`) with the V2H DRP-AI TVM compiler. See the script for the exact commands.
+ +## 2 — Build the Python binding (in the SDK Docker) + +Source the SDK env first (so `TVM_ROOT`/`SDK` are set and CXX is the aarch64 +cross compiler), then: + +```bash +cd rzv2h +cmake -B build \ + -DCMAKE_TOOLCHAIN_FILE="$TVM_ROOT/apps/toolchain/runtime.cmake" \ + -DPYBIND11_INCLUDE_DIR="$(python3 -c 'import pybind11; print(pybind11.get_include())')" \ + -DPYTHON_INCLUDE_DIR="$SDK/sysroots/aarch64-poky-linux/usr/include/python3.12" +cmake --build build -j +``` + +Adjust `python3.12` to the AI SDK image's Python version, and point +`PYBIND11_INCLUDE_DIR` at a real pybind11 headers dir if the one-liner doesn't +resolve in the container. + +## 3 — Deploy to the board + +Copy onto the RZ/V2H (e.g. under `/home/weston`): + +- this repo's `plugins/` (the gst-python-ml elements), +- `build/drpai_runtime.so`, +- the compiled `yolo11m_drpai_v2h/` deploy dir, +- a COCO label file if you overlay class names. + +```bash +export GST_PLUGIN_PATH=/home/weston/gst-python-ml/plugins:$GST_PLUGIN_PATH +export PYTHONPATH=/home/weston/rzv2h/build:$PYTHONPATH +gst-inspect-1.0 pyml_objectdetector +``` + +## 4 — Run on the board + +File -> annotated file (run as a user that can open `/dev/drpai0`, often root): + +```bash +gst-launch-1.0 filesrc location=clip.mp4 ! decodebin ! videoconvert ! videoscale \ + ! "video/x-raw,format=RGB,width=640,height=640" \ + ! pyml_objectdetector engine-name=drpai model-name=yolo11m_drpai_v2h device=drpai \ + input-format=nchw post-process=anchor_free \ + ! pyml_tracker tracker-type=bytetrack \ + ! videoconvert ! "video/x-raw,format=RGBA" ! pyml_overlay \ + ! videoconvert ! autovideosink +``` + +Live camera (MIPI/USB): swap `filesrc ! decodebin` for the camera source +(`v4l2src` / the EVK's ISP source), keeping the `640x640` caps into the detector.
diff --git a/rzv2h/convert_yolo11_v2h.sh b/rzv2h/convert_yolo11_v2h.sh new file mode 100755 index 0000000..06d8412 --- /dev/null +++ b/rzv2h/convert_yolo11_v2h.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# Compile YOLO11 (ONNX) -> RZ/V2H DRP-AI (INT8) deploy dir, using the REAL +# mera2 + DRP-AI Translator i8 + DRP-AI Quantizer flow. +# +# RUN INSIDE the drpai-tvm-v2h container (built via rzv2h/sdk_eval/build_image.sh), +# with this repo mounted at /work. RZ/V2H uses the DRP-AI INT8 accelerator, so +# quantization is MANDATORY and calibration images are required — this is why +# the plain FP compile_onnx_model.py does NOT work for V2H. +# +# Usage (inside container): +# ./rzv2h/convert_yolo11_v2h.sh [MODEL.onnx] [OUT_DIR] [CALIB_DIR] [IMGSZ] +# Defaults assume the repo is at /work and the ONNX is exported already +# (e.g. `yolo export model=models/yolo11m/yolo11m.pt format=onnx imgsz=640` on a +# host with ultralytics — the container has no ultralytics). +set -euo pipefail + +ONNX="${1:-/work/models/yolo11m/yolo11m.onnx}" +OUT="${2:-/work/rzv2h/yolo11m_drpai_v2h}" +CALIB="${3:-/work/rzv2h/calib}" +IMGSZ="${4:-640}" + +: "${TVM_ROOT:?run inside the drpai-tvm-v2h container (TVM_ROOT unset)}" +export PRODUCT=V2H +export SDK="$(find /opt/ -name sysroots -type d | head -1)/../" +export TRANSLATOR="$(find /opt/ -name python_api -type d | head -1)/../../" +: "${QUANTIZER:?QUANTIZER env not set (expected from the image)}" +export PATH="$TVM_ROOT/tutorials:$PATH" # so run_drp_compiler.sh resolves +chmod +x "$TVM_ROOT"/tutorials/*.sh 2>/dev/null || true # SDK ships them non-+x + +[[ -f "$ONNX" ]] || { echo "ONNX not found: $ONNX (export it first)"; exit 1; } +[[ -d "$CALIB" ]] || { echo "calibration image dir not found: $CALIB"; exit 1; } + +# The stock quant script preprocesses calibration images as ImageNet (224 + +# mean/std) — wrong for YOLO (needs IMGSZ, /255, RGB, CHW). Patch that one line. 
+python3 - "$TVM_ROOT/tutorials/compile_onnx_model_quant.py" "$IMGSZ" <<'PYEOF' +import sys +p, sz = sys.argv[1], int(sys.argv[2]) +s = open(p).read() +old = "input_data = pre_process_imagenet_pytorch(image, mean, stdev, need_transpose=True)" +new = ("input_data = (cv2.resize(image,(%d,%d))[:,:,::-1]" + ".astype('float32')/255.0).transpose(2,0,1)" % (sz, sz)) +if old in s: + open(p, "w").write(s.replace(old, new)); print("[patch] calibration preprocessing ->", sz) +else: + print("[patch] calibration line already patched / not found") +PYEOF + +rm -rf "$OUT" +cd "$TVM_ROOT/tutorials" +python3 compile_onnx_model_quant.py "$ONNX" \ + -o "$OUT" -i images -s "1,3,${IMGSZ},${IMGSZ}" \ + -t "$SDK" -d "$TRANSLATOR" -c "$QUANTIZER" --images "$CALIB" + +echo +echo "Done. RZ/V2H DRP-AI (INT8) deploy dir: $OUT" +echo " sub_0000__CPU_DRP_TVM/{deploy.so,deploy.json,deploy.params} (aarch64 + DRP-AI)" +echo " preprocess/ (DRP-AI pre-processing runtime objects)" +echo "Copy $OUT to the board; load sub_0000__CPU_DRP_TVM with the MERA runtime." diff --git a/rzv2h/drpai_runtime_pybind.cpp b/rzv2h/drpai_runtime_pybind.cpp new file mode 100644 index 0000000..5d2d0b9 --- /dev/null +++ b/rzv2h/drpai_runtime_pybind.cpp @@ -0,0 +1,149 @@ +// drpai_runtime_pybind.cpp +// Copyright (C) 2024-2026 Collabora Ltd. — LGPL (see COPYING). +// +// pybind11 binding around the Renesas DRP-AI TVM runtime +// (MeraDrpRuntimeWrapper, powered by EdgeCortix MERA(TM)) for RZ/V2H. +// +// Exposes a minimal `drpai_runtime.Runtime` class to Python so the pure-Python +// `drpai_engine.py` can drive the DRP-AI NPU: +// +// import drpai_runtime +// rt = drpai_runtime.Runtime() +// rt.load("/path/to/deploy_dir") # deploy.so/json/params +// rt.set_input(0, nchw_float32_numpy) +// rt.run() +// out0 = rt.get_output(0) # numpy (float32, fp16 upcast) +// +// Build with CMake against the board's DRP-AI TVM runtime — see CMakeLists.txt +// and README.md. 
This compiles only inside the RZ/V2H DRP-AI TVM SDK and runs +// only on the board (it talks to /dev/drpai0). + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "MeraDrpRuntimeWrapper.h" + +namespace py = pybind11; + +static float fp16_to_fp32(uint16_t h) { + uint32_t sign = static_cast(h & 0x8000) << 16; + uint32_t exp = (h >> 10) & 0x1F; + uint32_t mant = h & 0x3FF; + uint32_t f; + if (exp == 0) { + if (mant == 0) { + f = sign; + } else { + exp = 127 - 15 + 1; + while ((mant & 0x400) == 0) { + mant <<= 1; + exp--; + } + mant &= 0x3FF; + f = sign | (exp << 23) | (mant << 13); + } + } else if (exp == 0x1F) { + f = sign | 0x7F800000 | (mant << 13); // Inf / NaN + } else { + f = sign | ((exp - 15 + 127) << 23) | (mant << 13); + } + float out; + std::memcpy(&out, &f, sizeof(out)); + return out; +} + +static uint64_t get_drpai_start_addr() { + int fd = open("/dev/drpai0", O_RDWR); + if (fd < 0) { + throw std::runtime_error("Failed to open /dev/drpai0 (run on the board, as root?)"); + } + drpai_data_t drpai_data; + int ret = ioctl(fd, DRPAI_GET_DRPAI_AREA, &drpai_data); + close(fd); + if (ret == -1) { + throw std::runtime_error("ioctl(DRPAI_GET_DRPAI_AREA) failed"); + } + return drpai_data.address; +} + +class Runtime { + public: + Runtime() : rt_() {} + + bool load(const std::string& model_dir) { + model_dir_ = model_dir; + return rt_.LoadModel(model_dir, get_drpai_start_addr()); + } + + void set_input(int index, + py::array_t data) { + rt_.SetInput(index, static_cast(data.data())); + } + + void run() { rt_.Run(); } + + int num_input() { return rt_.GetNumInput(model_dir_); } + int num_output() { return rt_.GetNumOutput(); } + + py::array get_output(int index) { + auto out = rt_.GetOutput(index); + InOutDataType dtype = std::get<0>(out); + const void* ptr = std::get<1>(out); + int64_t size = std::get<2>(out); + + switch (dtype) { + case InOutDataType::FLOAT16: { + const uint16_t* src = 
reinterpret_cast(ptr); + py::array_t result(size); + float* dst = static_cast(result.request().ptr); + for (int64_t i = 0; i < size; ++i) dst[i] = fp16_to_fp32(src[i]); + return result; + } + case InOutDataType::FLOAT32: { + py::array_t result(size); + std::memcpy(result.request().ptr, ptr, size * sizeof(float)); + return result; + } + case InOutDataType::INT32: { + py::array_t result(size); + std::memcpy(result.request().ptr, ptr, size * sizeof(int32_t)); + return result; + } + case InOutDataType::INT64: { + py::array_t result(size); + std::memcpy(result.request().ptr, ptr, size * sizeof(int64_t)); + return result; + } + default: + throw std::runtime_error("Unsupported DRP-AI output data type"); + } + } + + private: + MeraDrpRuntimeWrapper rt_; + std::string model_dir_; +}; + +PYBIND11_MODULE(drpai_runtime, m) { + m.doc() = "pybind11 binding for the Renesas DRP-AI TVM runtime (RZ/V2H)"; + py::class_(m, "Runtime") + .def(py::init<>()) + .def("load", &Runtime::load, py::arg("model_dir"), + "Load a DRP-AI TVM deploy directory (deploy.so/json/params).") + .def("set_input", &Runtime::set_input, py::arg("index"), py::arg("data")) + .def("run", &Runtime::run) + .def("num_input", &Runtime::num_input) + .def("num_output", &Runtime::num_output) + .def("get_output", &Runtime::get_output, py::arg("index")); +} diff --git a/rzv2h/emulation/drpai_runtime.py b/rzv2h/emulation/drpai_runtime.py new file mode 100644 index 0000000..6fdbe25 --- /dev/null +++ b/rzv2h/emulation/drpai_runtime.py @@ -0,0 +1,123 @@ +# drpai_runtime.py — off-board stand-in for the native pybind `drpai_runtime`. +# Copyright (C) 2024-2026 Collabora Ltd. — LGPL (see COPYING). +# +# Same interface as the C++ binding (Runtime.load / set_input / run / +# num_output / get_output), with two backends auto-selected by what's in the +# model directory and what's importable: +# +# 1. MERA / TVM graph_executor — if the dir has deploy.so/json/params AND a +# `tvm` runtime is importable (i.e. 
inside the Renesas DRP-AI TVM SDK +# container, or on the board). This runs the REAL MERA/TVM runtime — the +# faithful "test through the TVM runtime". On the board the deploy.so runs +# on the DRP-AI NPU / Arm CPU; in the SDK container it runs on whatever the +# module was compiled for (aarch64 needs QEMU; an x86-target build runs +# natively for functional check). +# +# 2. ONNX Runtime (CPU) — fallback look-alike for plain x86 dev boxes +# with no SDK: runs the same yolo11m.onnx that feeds the DRP-AI compiler so +# the engine's preprocess/reshape/decode path is exercised. Validates our +# code, NOT the DRP-AI/MERA runtime. +# +# get_output() always returns a FLAT array, matching the C++ GetOutput buffer, +# so the engine's reshape-to-(1, 4+nc, anchors) path is genuinely tested. + +import glob +import os + +import numpy as np + + +class Runtime: + def __init__(self): + self._backend = None + # tvm backend + self._mod = None + self._dev = None + self._input_name = os.getenv("DRPAI_INPUT_NAME", "images") + # onnx backend + self._sess = None + self._ort_input = None + self._feed = None + self._outputs = None + + def load(self, model_dir): + deploy_so = os.path.join(model_dir, "deploy.so") + if os.path.isfile(deploy_so) and self._try_load_tvm(model_dir, deploy_so): + return True + return self._try_load_onnx(model_dir) + + # ---- backend 1: real MERA / TVM graph_executor ---- + def _try_load_tvm(self, model_dir, deploy_so): + try: + import tvm + from tvm.contrib import graph_executor + except ImportError: + return False + try: + lib = tvm.runtime.load_module(deploy_so) + with open(os.path.join(model_dir, "deploy.json")) as f: + graph = f.read() + self._dev = tvm.cpu(0) + self._mod = graph_executor.create(graph, lib, self._dev) + with open(os.path.join(model_dir, "deploy.params"), "rb") as f: + self._mod.load_params(bytearray(f.read())) + self._backend = "tvm" + print( + f"[drpai_runtime] MERA/TVM graph_executor backend " + f"(deploy.so, input='{self._input_name}') — 
real runtime" + ) + return True + except Exception as e: + print(f"[drpai_runtime] TVM backend load failed ({e}); trying ONNX") + return False + + # ---- backend 2: ONNX Runtime look-alike ---- + def _try_load_onnx(self, model_dir): + try: + import onnxruntime as ort + except ImportError: + print("[drpai_runtime] no TVM and no onnxruntime — cannot load") + return False + onnx_files = sorted(glob.glob(os.path.join(model_dir, "*.onnx"))) + if not onnx_files: + print(f"[drpai_runtime] no deploy.so and no .onnx in {model_dir!r}") + return False + self._sess = ort.InferenceSession( + onnx_files[0], providers=["CPUExecutionProvider"] + ) + self._ort_input = self._sess.get_inputs()[0].name + self._backend = "onnx" + print( + f"[drpai_runtime] ONNX Runtime EMULATION backend ({onnx_files[0]}, " + f"input='{self._ort_input}') — NOT the NPU/MERA runtime" + ) + return True + + def set_input(self, index, data): + arr = np.ascontiguousarray(data, dtype=np.float32) + if self._backend == "tvm": + import tvm + + self._mod.set_input(self._input_name, tvm.nd.array(arr, self._dev)) + else: + self._feed = arr + + def run(self): + if self._backend == "tvm": + self._mod.run() + else: + self._outputs = self._sess.run(None, {self._ort_input: self._feed}) + + def num_input(self): + return 1 + + def num_output(self): + if self._backend == "tvm": + return self._mod.get_num_outputs() + return len(self._outputs) if self._outputs is not None else 0 + + def get_output(self, index): + # Flat buffer, like the C++ GetOutput; the engine reshapes it. 
+ if self._backend == "tvm": + return self._mod.get_output(index).numpy().reshape(-1).astype(np.float32) + return np.asarray(self._outputs[index], dtype=np.float32).reshape(-1) diff --git a/rzv2h/emulation/run_emulated.sh b/rzv2h/emulation/run_emulated.sh new file mode 100755 index 0000000..527acf8 --- /dev/null +++ b/rzv2h/emulation/run_emulated.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +# Run the DRP-AI object-detection pipeline on the DEV BOX using the emulated +# drpai_runtime (CPU/ONNX Runtime stand-in) — same engine code as the board, +# but no NPU. For validating the integration before deploying to RZ/V2H. +# +# Usage: ./run_emulated.sh [INPUT.mp4] [OUTPUT.mp4] +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)" +REPO="$(cd "$HERE/../.." && pwd)" +cd "$REPO" + +source .venv/bin/activate +export GST_PLUGIN_PATH="$REPO/plugins:${GST_PLUGIN_PATH:-}" +export PYTHONPATH="$HERE:${PYTHONPATH:-}" # resolves `import drpai_runtime` to the fake + +IN="${1:-08fd33_4.mp4}" +OUT="${2:-${IN%.*}_drpai_emu.mp4}" +DEPLOY="$HERE/yolo11m_drpai_v2h_emu" # dir containing yolo11m.onnx + +if [[ ! -f "$DEPLOY/yolo11m.onnx" ]]; then + echo "Missing $DEPLOY/yolo11m.onnx — export it first:" >&2 + echo " yolo export model=yolo11m.pt format=onnx imgsz=640 opset=12 simplify=True" >&2 + echo " mkdir -p $DEPLOY && cp yolo11m.onnx $DEPLOY/" >&2 + exit 1 +fi + +echo "EMULATED DRP-AI run: '$IN' -> '$OUT' (CPU/ONNX, not the NPU)" +gst-launch-1.0 -e \ + filesrc location="$IN" ! decodebin ! videoconvert ! videoscale \ + ! "video/x-raw,format=RGB,width=640,height=640" \ + ! pyml_objectdetector engine-name=drpai model-name="$DEPLOY" device=drpai \ + input-format=nchw post-process=anchor_free \ + ! pyml_tracker tracker-type=bytetrack \ + ! videoconvert ! "video/x-raw,format=RGBA" \ + ! pyml_football_overlay show-ids=false show-labels=false \ + ! videoconvert ! openh264enc ! h264parse ! mp4mux ! 
filesink location="$OUT" +echo "Done: $OUT" diff --git a/rzv2h/sdk_eval/README.md b/rzv2h/sdk_eval/README.md new file mode 100644 index 0000000..13fa30f --- /dev/null +++ b/rzv2h/sdk_eval/README.md @@ -0,0 +1,113 @@ +# Faithful DRP-AI TVM eval (real mera2 / MERA runtime) + +This is the most faithful test short of running on hardware: the **real** +`mera2` compile and the **real** MERA/TVM runtime, instead of the ONNX-RT +look-alike in [../emulation](../emulation). It composes with the same +`engine-name=drpai` + `drpai_runtime` shim we use everywhere else. + +## Read this first — what's gated, and the aarch64 catch + +Two things make this unable to run on a plain x86 box out of the box: + +1. **License-gated downloads (Renesas account required).** The stack build needs + the **DRP-AI Translator i8** and the **RZ/V2H AI SDK** (`RTK0EF0180F06000SJ.zip`). + There is **no public prebuilt image**; you download these and build Renesas' + `Dockerfile`. I cannot fetch them for you. +2. **The compile targets aarch64, not x86.** Even `compile_cpu_only_onnx_model.py` + uses `target = "llvm ... -mtriple=aarch64-linux-gnu"` and the SDK's aarch64 + cross-g++. So `deploy.so` runs on the board's Arm CPU / NPU — to execute it + off-board you either run on the **board**, under **QEMU-aarch64**, or compile + with an **x86 `llvm` target** for a pure functional check (see below). + +If you don't have the downloads, the ONNX-RT emulation in `../emulation` +already validates all of *our* code (engine preprocess/reshape/decode + +pipeline). What's left to validate here is mera2-compile success and runtime +numerics — both inherently need Renesas assets or hardware. + +## Steps + +### 1. 
Build the SDK image (host, needs the two downloads) + +```bash +mkdir -p rzv2h/sdk_eval/assets +# put both Renesas downloads in rzv2h/sdk_eval/assets/ : +# DRP-AI_Translator_i8-*-Linux-x86_64-Install and RTK0EF0180F06000SJ.zip +cd rzv2h/sdk_eval && ./build_image.sh +``` + +`build_image.sh` fetches the repo `Dockerfile`, assembles a clean build context +(Dockerfile + the toolchain `.sh` it unzips from the AI SDK zip + the Translator +installer), and runs `docker build --build-arg PRODUCT=V2H -t drpai-tvm-v2h`. +The Dockerfile (`FROM ubuntu:22.04`) defaults `PRODUCT=V2H` and builds the TVM +fork itself, so the build takes a while. + +To fetch just the Dockerfile by hand: +`wget https://raw.githubusercontent.com/renesas-rz/rzv_drp-ai_tvm/main/Dockerfile` + +### 2. Compile YOLO11 with the real mera2 (inside the container) + +```bash +docker run -it --rm -v "$PWD":/workspace/gst-python-ml drpai-tvm-v2h bash +# inside (export the ONNX on the host first; the container has no ultralytics): +cd /workspace/gst-python-ml +./rzv2h/convert_yolo11_v2h.sh "$PWD/models/yolo11m/yolo11m.onnx" "$PWD/yolo11m_drpai_v2h" "$PWD/rzv2h/calib" 640 # real mera2.from_onnx + mera2.drp.build +# -> yolo11m_drpai_v2h/{deploy.so,deploy.json,deploy.params} (aarch64) +``` + +For a **host x86 functional check** instead of the board artifact, compile with a +native target (edit a copy of `tutorials/compile_onnx_model.py` to +`target = "llvm"` and drop the aarch64 cross-compiler), producing an x86 +`deploy.so` the MERA/TVM `graph_executor` can run natively. + +### 3. Run through the real MERA/TVM runtime + +The [../emulation/drpai_runtime.py](../emulation/drpai_runtime.py) shim +auto-selects the **MERA/TVM `graph_executor`** backend as soon as the model dir +has `deploy.so/json/params` and `tvm` is importable (true inside this +container). The engine code is unchanged.
+ +```bash +export GST_PLUGIN_PATH=/workspace/gst-python-ml/plugins:$GST_PLUGIN_PATH +export PYTHONPATH=/workspace/gst-python-ml/rzv2h/emulation:$PYTHONPATH +# (x86 deploy.so) run natively; (aarch64 deploy.so) run under qemu-aarch64 +gst-launch-1.0 filesrc location=08fd33_4.mp4 ! decodebin ! videoconvert ! videoscale \ + ! "video/x-raw,format=RGB,width=640,height=640" \ + ! pyml_objectdetector engine-name=drpai model-name=yolo11m_drpai_v2h device=drpai \ + input-format=nchw post-process=anchor_free \ + ! pyml_tracker ! videoconvert ! "video/x-raw,format=RGBA" \ + ! pyml_football_overlay ! videoconvert ! autovideosink +``` + +The shim prints which backend it picked: +`[drpai_runtime] MERA/TVM graph_executor backend ... — real runtime`. + +## On the actual board + +Two ways to run the same pipeline on the RZ/V2H: + +- **Python graph_executor** — copy the `deploy.so/json/params` + the emulation + shim; if the board image has the MERA/TVM python runtime, it Just Works (the + shim's TVM backend), NPU included. +- **C++ pybind binding** — build [../drpai_runtime_pybind.cpp](../drpai_runtime_pybind.cpp) + per [../README.md](../README.md); the native `drpai_runtime.so` takes + precedence over this shim on `PYTHONPATH`. + +## Verified results (RZ/V2H AI SDK v6.00 + DRP-AI Translator i8 v1.11) + +Both paths were run end-to-end driving the `drpai-tvm-v2h` image on an x86 host: + +- **x86 MERA/TVM runtime test** — `compile_x86_cpu.py` compiled YOLO11 via the + MERA-fork TVM (native `llvm`), and `x86_runtime_check.py` ran it through the + real `graph_executor`: output matched ONNX to **max|Δ| = 6.2e-3**, **22 = 22 + detections** (label `person`). Confirms compile + MERA/TVM runtime + the + `drpai_runtime` shim + our decoder, no NPU needed. 
+- **Real INT8 NPU compile** — `../convert_yolo11_v2h.sh` (quantized flow) + produced the RZ/V2H deploy dir: `[Finish DRP-AI Translator for V2H]`, + `sub_0000__CPU_DRP_TVM/{deploy.so (65 MB),deploy.json,deploy.params}` + + `preprocess/` (DRP-AI pre-processing objects). aarch64 — runs on the board. + +SDK gotchas the scripts now handle automatically: `run_drp_compiler.sh` ships +non-executable and off-PATH (`chmod +x` + add tutorials to PATH); the quant +script preprocesses calibration as ImageNet-224 instead of 640 (patched). And +V2H **requires** the INT8 quantized flow — the plain FP `compile_onnx_model.py` +drives a legacy translator path the i8 v1.11 layout lacks. diff --git a/rzv2h/sdk_eval/_probe_sysroot.sh b/rzv2h/sdk_eval/_probe_sysroot.sh new file mode 100644 index 0000000..0344994 --- /dev/null +++ b/rzv2h/sdk_eval/_probe_sysroot.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# Probe the RZ/V2H board rootfs (via the cross-SDK aarch64 sysroot) for the +# GStreamer + Python stack our pipeline needs. Run inside drpai-tvm-v2h. +# Target rootfs sysroot (NOT the x86_64-pokysdk-linux cross-compiler dir). 
+SR=$(ls -d /opt/*/*/sysroots/*-poky-linux 2>/dev/null | grep -v pokysdk | head -1) +[ -d "$SR" ] || SR=$(ls -d /opt/*/sysroots/*-poky-linux 2>/dev/null | grep -v pokysdk | head -1) +echo "sysroot = $SR" +echo "--- python3 ---"; ls -d "$SR"/usr/lib/python3* 2>/dev/null | head -1 +echo "--- gstreamer core ---"; ls "$SR"/usr/lib/libgstreamer-1.0.so.* 2>/dev/null +grep -h "Version" "$SR"/usr/lib/pkgconfig/gstreamer-1.0.pc 2>/dev/null +echo "--- gst-python loader (libgstpython) ---"; find "$SR" -name 'libgstpython*' 2>/dev/null | head +echo "--- GstAnalytics (lib + typelib) ---" +find "$SR" -iname '*gstanalytics*' 2>/dev/null | head +ls "$SR"/usr/lib/girepository-1.0/ 2>/dev/null | grep -iE 'Analytics|GstApp|GstBase|^Gst-' | head +echo "--- python modules on target: gi / numpy / cairo / cv2 ---" +for m in gi numpy cairo cv2; do + hit=$(find "$SR" -maxdepth 7 -path '*python3*' -iname "${m}" 2>/dev/null | head -1) + echo "$m: ${hit:-MISSING}" +done +echo "--- tvm / mera python runtime on target? ---" +find "$SR" -iname '*tvm*' -o -iname '*mera*' 2>/dev/null | grep -i python | head +echo "--- gstreamer plugins present (count) ---" +ls "$SR"/usr/lib/gstreamer-1.0/*.so 2>/dev/null | wc -l diff --git a/rzv2h/sdk_eval/build_image.sh b/rzv2h/sdk_eval/build_image.sh new file mode 100755 index 0000000..7519a5d --- /dev/null +++ b/rzv2h/sdk_eval/build_image.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +# Build the Renesas DRP-AI TVM (Mera2) Docker image for RZ/V2H. +# +# This is the *faithful* compile/runtime stack (real mera2 + MERA runtime). +# It needs two downloads that require a Renesas account login — put them in +# ./assets first (this script cannot download them for you). 
Both arrive as +# ZIPs and can be dropped in as-is: +# +# DRP-AI Translator i8 (ZIP, contains DRP-AI_Translator_i8-*-Linux-x86_64-Install) +# https://www.renesas.com/software-tool/drp-ai-translator-i8 (Downloads tab) +# RZ/V2H AI SDK (RTK0EF0180F*SJ.zip) +# https://www.renesas.com/us/en/software-tool/rzv2h-ai-software-development-kit +# +# The repo Dockerfile COPYs every ./*.sh in the context and runs it, plus +# ./DRP-AI_Translator*-Install. So we assemble a CLEAN context holding only: +# Dockerfile + the SDK toolchain installer (.sh, from the AI SDK zip) + the +# Translator installer (from the Translator zip). +set -euo pipefail +cd "$(dirname "$0")" +ASSETS="${ASSETS:-./assets}" +CTX="${CTX:-./context}" +PRODUCT="${PRODUCT:-V2H}" +TAG="${TAG:-drpai-tvm-v2h}" + +mkdir -p "$ASSETS" +TMPS=() +cleanup() { for d in "${TMPS[@]:-}"; do [[ -n "$d" ]] && rm -rf "$d"; done; } +trap cleanup EXIT + +# --- DRP-AI Translator i8: accept an extracted *-Install or the downloaded zip --- +TR=$(ls "$ASSETS"/DRP-AI_Translator*-Linux*-x86_64-Install 2>/dev/null | head -n1 || true) +if [[ -z "$TR" ]]; then + TRZIP=$(ls "$ASSETS"/*[Tt]ranslator*i8*.zip "$ASSETS"/*DRP-AI_Translator*.zip 2>/dev/null | head -n1 || true) + if [[ -n "$TRZIP" ]]; then + t=$(mktemp -d); TMPS+=("$t") + unzip -o -q "$TRZIP" -d "$t" + TR=$(find "$t" -iname "DRP-AI_Translator*-Linux*-x86_64-Install" | head -n1 || true) + fi +fi + +# --- RZ/V2H AI SDK zip (any v6.x build number) --- +ZIP=$(ls "$ASSETS"/RTK0EF0180F*SJ.zip 2>/dev/null | head -n1 || true) + +if [[ -z "$TR" || -z "$ZIP" ]]; then + echo "Missing gated downloads in $ASSETS (Renesas login required):" >&2 + [[ -z "$TR" ]] && echo " - DRP-AI Translator i8 (zip or extracted *-Install)" >&2 + [[ -z "$ZIP" ]] && echo " - RZ/V2H AI SDK (RTK0EF0180F*SJ.zip)" >&2 + exit 1 +fi + +# Clean build context. 
+rm -rf "$CTX" && mkdir -p "$CTX" +wget -nc https://raw.githubusercontent.com/renesas-rz/rzv_drp-ai_tvm/main/Dockerfile \ + -O "$CTX/Dockerfile" +cp "$TR" "$CTX/" + +# Unzip the AI SDK and extract its Yocto toolchain installer (.sh) into context. +s=$(mktemp -d); TMPS+=("$s") +unzip -o -q "$ZIP" -d "$s" +# The Yocto toolchain installer is the big *toolchain*.sh (e.g. +# ai_sdk_setup/rz-vlp-...-rzv2h-evk-toolchain-5.0.11.sh). Pick the largest +# match so we don't grab a small board/flash helper script by mistake. +SDK_SH=$(find "$s" -iname "*toolchain*.sh" -printf '%s\t%p\n' | sort -rn | head -n1 | cut -f2-) +[[ -n "$SDK_SH" ]] || { echo "No toolchain .sh found inside $ZIP" >&2; exit 1; } +cp "$SDK_SH" "$CTX/" + +echo "Build context ready in $CTX:" +ls -1 "$CTX" +echo +echo "Building image '$TAG' (PRODUCT=$PRODUCT) — builds the TVM fork, takes a while..." +docker build --build-arg PRODUCT="$PRODUCT" -t "$TAG" "$CTX" + +cat </deploy.{so,json,params} — loadable by tvm.contrib.graph_executor, i.e. +# by the drpai_runtime shim's TVM backend. 
+# +# Run inside the drpai-tvm-v2h container: +# python3 compile_x86_cpu.py [input_name] [C,H,W] +import os +import sys + +import onnx +import tvm +from tvm import relay +from tvm.relay import transform +from tvm.relay.build_module import build as _build, bind_params_by_name +from tvm.relay.param_dict import save_param_dict +from tvm.ir.transform import Sequential, PassContext + +model_file = sys.argv[1] +out_dir = sys.argv[2] +input_name = sys.argv[3] if len(sys.argv) > 3 else "images" +chw = [int(x) for x in (sys.argv[4].split(",") if len(sys.argv) > 4 else [3, 640, 640])] +input_shape = [1] + chw + +os.makedirs(out_dir, exist_ok=True) +print(f"[x86 compile] {model_file} input {input_name}={input_shape} -> {out_dir}") + +onnx_model = onnx.load_model(model_file) +mod, params = relay.frontend.from_onnx(onnx_model, {input_name: input_shape}) +if params: + mod["main"] = bind_params_by_name(mod["main"], params) + +with PassContext(opt_level=3): + mod = Sequential([ + transform.SimplifyInference(), + transform.FoldConstant(), + transform.FoldExplicitPadding(), + transform.BackwardFoldScaleAxis(), + transform.ForwardFoldScaleAxis(), + transform.FoldConstant(), + transform.DynamicToStatic(), + transform.RemoveUnusedFunctions(), + ])(mod) + +target = "llvm" # native host (x86), no aarch64 cross target +with PassContext(opt_level=3): + graph, lib, all_params = _build(mod, target=target, target_host=target, params=params) + +lib.export_library(os.path.join(out_dir, "deploy.so")) # default host compiler -> x86 .so +with open(os.path.join(out_dir, "deploy.json"), "w") as f: + f.write(graph) +with open(os.path.join(out_dir, "deploy.params"), "wb") as f: + f.write(save_param_dict(all_params)) +print(f"[x86 compile finished] -> {out_dir}/deploy.so,deploy.json,deploy.params") diff --git a/rzv2h/sdk_eval/x86_runtime_check.py b/rzv2h/sdk_eval/x86_runtime_check.py new file mode 100644 index 0000000..d1ee523 --- /dev/null +++ b/rzv2h/sdk_eval/x86_runtime_check.py @@ -0,0 +1,34 @@ 
+#!/usr/bin/env python3 +# x86_runtime_check.py — run a real input through the MERA/TVM graph_executor +# (via the drpai_runtime shim's TVM backend) and check parity against the +# known-good ONNX output. Run INSIDE the drpai-tvm-v2h container; needs the +# x86 deploy dir + the pre-saved input/onnx-reference .npy files. +import sys +import numpy as np + +sys.path.insert(0, "/work/rzv2h/emulation") # drpai_runtime shim (TVM backend) +sys.path.insert(0, "/work/plugins/python") # utils.detection_decoder (pure numpy) + +import drpai_runtime +from utils.detection_decoder import decode + +DEPLOY = "/work/rzv2h/yolo11m_x86_cpu" +x = np.load("/work/rzv2h/_x86test_input.npy").astype(np.float32) +ref = np.load("/work/rzv2h/_x86test_onnxout.npy").astype(np.float32).reshape(-1) + +rt = drpai_runtime.Runtime() +assert rt.load(DEPLOY), "drpai_runtime.load failed" +rt.set_input(0, x) +rt.run() +out = np.asarray(rt.get_output(0), dtype=np.float32).reshape(-1) + +n = min(out.size, ref.size) +maxdiff = float(np.max(np.abs(out[:n] - ref[:n]))) if n else float("nan") +print(f"TVM out size={out.size} ref size={ref.size} max|TVM-ONNX|={maxdiff:.3e}") + +tvm_det = decode(out.reshape(1, 84, 8400), "anchor_free")[0] +onnx_det = decode(ref.reshape(1, 84, 8400), "anchor_free")[0] +print(f"detections TVM={len(tvm_det['boxes'])} ONNX={len(onnx_det['boxes'])}") +if len(tvm_det["boxes"]): + print("TVM labels:", sorted(set(int(c) for c in tvm_det["labels"]))) +print("PASS" if maxdiff < 1e-2 and len(tvm_det["boxes"]) == len(onnx_det["boxes"]) else "CHECK") diff --git a/rzv2h/yocto/README.md b/rzv2h/yocto/README.md new file mode 100644 index 0000000..ccfbdb4 --- /dev/null +++ b/rzv2h/yocto/README.md @@ -0,0 +1 @@ +# Custom RZ/V2H image for the gst-python-ml pipeline diff --git a/rzv2h/yocto/meta-gst-python-ml/conf/include/gstreamer-1.24.inc b/rzv2h/yocto/meta-gst-python-ml/conf/include/gstreamer-1.24.inc new file mode 100644 index 0000000..cfa0a3d --- /dev/null +++ 
b/rzv2h/yocto/meta-gst-python-ml/conf/include/gstreamer-1.24.inc @@ -0,0 +1,25 @@ +# Pin GStreamer to 1.24 across the stack so GstAnalytics is available. +# +# scarthgap's oe-core ships GStreamer 1.22.x. The 1.24 recipes must be present +# in the build (see README: copy the gstreamer1.0* recipes from oe-core +# styhead/master into recipes-multimedia/gstreamer/ of this layer, or layer in +# a newer meta-oe). These PREFERRED_VERSION lines then select them. +# +# require this from local.conf: +# require ${TOPDIR}/../layers/meta-gst-python-ml/conf/include/gstreamer-1.24.inc + +GST_124 ?= "1.24.%" + +PREFERRED_VERSION_gstreamer1.0 = "${GST_124}" +PREFERRED_VERSION_gstreamer1.0-plugins-base = "${GST_124}" +PREFERRED_VERSION_gstreamer1.0-plugins-good = "${GST_124}" +PREFERRED_VERSION_gstreamer1.0-plugins-bad = "${GST_124}" +PREFERRED_VERSION_gstreamer1.0-plugins-ugly = "${GST_124}" +PREFERRED_VERSION_gstreamer1.0-libav = "${GST_124}" +PREFERRED_VERSION_gstreamer1.0-python = "${GST_124}" +PREFERRED_VERSION_gstreamer1.0-rtsp-server = "${GST_124}" +PREFERRED_VERSION_gstreamer1.0-vaapi = "${GST_124}" + +# GstAnalytics + the object-detection / tracking metas live in -plugins-bad. +# Make sure analytics isn't disabled by a PACKAGECONFIG override. +PACKAGECONFIG:append:pn-gstreamer1.0-plugins-bad = " analytics" diff --git a/rzv2h/yocto/meta-gst-python-ml/conf/layer.conf b/rzv2h/yocto/meta-gst-python-ml/conf/layer.conf new file mode 100644 index 0000000..262f68a --- /dev/null +++ b/rzv2h/yocto/meta-gst-python-ml/conf/layer.conf @@ -0,0 +1,15 @@ +# meta-gst-python-ml — adds the runtime stack gst-python-ml needs on RZ/V2H. +# +# The RZ/V2H AI SDK v6.00 image is Yocto scarthgap (5.0.11) with GStreamer +# 1.22.x. gst-python-ml requires GStreamer >= 1.24 (for GstAnalytics, the +# metadata type every pyml_* element uses), the gst-python plugin loader, and +# numpy/pycairo/pygobject/opencv. This layer carries those additions. 
+BBPATH .= ":${LAYERDIR}" +BBFILES += "${LAYERDIR}/recipes-*/*/*.bb ${LAYERDIR}/recipes-*/*/*.bbappend" + +BBFILE_COLLECTIONS += "gst-python-ml" +BBFILE_PATTERN_gst-python-ml = "^${LAYERDIR}/" +BBFILE_PRIORITY_gst-python-ml = "20" + +LAYERDEPENDS_gst-python-ml = "core openembedded-layer" +LAYERSERIES_COMPAT_gst-python-ml = "scarthgap styhead" diff --git a/rzv2h/yocto/meta-gst-python-ml/recipes-core/packagegroups/packagegroup-gst-python-ml.bb b/rzv2h/yocto/meta-gst-python-ml/recipes-core/packagegroups/packagegroup-gst-python-ml.bb new file mode 100644 index 0000000..4a9cef9 --- /dev/null +++ b/rzv2h/yocto/meta-gst-python-ml/recipes-core/packagegroups/packagegroup-gst-python-ml.bb @@ -0,0 +1,26 @@ +SUMMARY = "Runtime stack for gst-python-ml on RZ/V2H (GStreamer 1.24 + Python)" +LICENSE = "MIT" + +inherit packagegroup + +RDEPENDS:${PN} = " \ + gstreamer1.0 \ + gstreamer1.0-plugins-base \ + gstreamer1.0-plugins-good \ + gstreamer1.0-plugins-bad \ + gstreamer1.0-libav \ + gstreamer1.0-python \ + python3-core \ + python3-pygobject \ + python3-numpy \ + python3-pycairo \ + python3-opencv \ +" +# Notes: +# - gstreamer1.0-python provides the libgstpython.so plugin loader that runs +# the pyml_* .py elements. It is NOT in the stock AI SDK image. +# - GstAnalytics (used by base_objectdetector / tracker / overlay) ships in +# gstreamer1.0-plugins-bad once GStreamer is >= 1.24 with the analytics +# PACKAGECONFIG enabled (see conf/include/gstreamer-1.24.inc). +# - The DRP-AI MERA/TVM *Python* runtime is not a stock Yocto package; install +# it onto the image separately (see ../README.md "DRP-AI runtime on board"). 
diff --git a/rzv2h/yocto/meta-gst-python-ml/recipes-multimedia/gst-python-ml/gst-python-ml_git.bb b/rzv2h/yocto/meta-gst-python-ml/recipes-multimedia/gst-python-ml/gst-python-ml_git.bb new file mode 100644 index 0000000..3b3245b --- /dev/null +++ b/rzv2h/yocto/meta-gst-python-ml/recipes-multimedia/gst-python-ml/gst-python-ml_git.bb @@ -0,0 +1,35 @@ +SUMMARY = "gst-python-ml elements (pyml_*) + DRP-AI engine for RZ/V2H" +DESCRIPTION = "Installs the pure-Python GStreamer elements and sets GST_PLUGIN_PATH/PYTHONPATH." +LICENSE = "LGPL-2.1-or-later" +LIC_FILES_CHKSUM = "file://COPYING;md5=" + +# Point this at your gst-python-ml source. Examples: +# SRC_URI = "git://github.com/collabora/gst-python-ml.git;branch=main;protocol=https" +# SRCREV = "" +# or a local checkout via: SRC_URI = "file:///path/to/gst-python-ml" +SRC_URI = "git://github.com/collabora/gst-python-ml.git;branch=master;protocol=https" +SRCREV = "${AUTOREV}" +S = "${WORKDIR}/git" + +# Pure-Python elements: nothing to compile. +do_compile[noexec] = "1" + +PYML_DIR = "${datadir}/gst-python-ml" + +do_install() { + install -d ${D}${PYML_DIR} + cp -r ${S}/plugins ${D}${PYML_DIR}/plugins + + # Environment so GStreamer finds the .py elements and Python finds the pkg. + install -d ${D}${sysconfdir}/profile.d + cat > ${D}${sysconfdir}/profile.d/gst-python-ml.sh <