diff --git a/configs/AM62DX/AM62DX_linux_toc.txt b/configs/AM62DX/AM62DX_linux_toc.txt index 2ebb80cf2..edc0c4a76 100644 --- a/configs/AM62DX/AM62DX_linux_toc.txt +++ b/configs/AM62DX/AM62DX_linux_toc.txt @@ -62,6 +62,12 @@ linux/Foundational_Components/Kernel/Kernel_Drivers/UART linux/Foundational_Components/Kernel/Kernel_Drivers/UBIFS linux/Foundational_Components/Kernel/Kernel_Drivers/VTM linux/Foundational_Components/Kernel/Kernel_Drivers/Watchdog +linux/Foundational_Components_Machine_Learning +linux/Foundational_Components/Machine_Learning/arm_compute_library +linux/Foundational_Components/Machine_Learning/armnn +linux/Foundational_Components/Machine_Learning/nnstreamer +linux/Foundational_Components/Machine_Learning/onnxrt +linux/Foundational_Components/Machine_Learning/tflite #linux/Foundational_Components_Power_Management diff --git a/source/images/Sitara_machine_learning_stack_diagram.jpeg b/source/images/Sitara_machine_learning_stack_diagram.jpeg new file mode 100644 index 000000000..9fb067704 Binary files /dev/null and b/source/images/Sitara_machine_learning_stack_diagram.jpeg differ diff --git a/source/images/Sitara_machine_learning_stack_diagram.png b/source/images/Sitara_machine_learning_stack_diagram.png deleted file mode 100644 index 6fb0fe0bf..000000000 Binary files a/source/images/Sitara_machine_learning_stack_diagram.png and /dev/null differ diff --git a/source/linux/Foundational_Components/Machine_Learning/arm_compute_library.rst b/source/linux/Foundational_Components/Machine_Learning/arm_compute_library.rst index 3bfa2de10..43b8ca2fd 100644 --- a/source/linux/Foundational_Components/Machine_Learning/arm_compute_library.rst +++ b/source/linux/Foundational_Components/Machine_Learning/arm_compute_library.rst @@ -10,7 +10,7 @@ Exact list of functions can be found at https://www.arm.com/products/development Supported versions ------------------ - - ARM Compute Library 24.12 + - ARM Compute Library 52.7.0 Arm Compute Library Testing 
--------------------------- @@ -19,10 +19,10 @@ Arm Compute Libraries, tests, and sample executables are included in the SDK fil .. code-block:: console - root@am62xx-evm:~# LD_LIBRARY_PATH=/usr/lib/tests/ /usr/lib/tests/arm_compute_validation - Version = 32bcced2af7feea6969dd1d22e58d0718dc488e3 - CommandLine = /usr/lib/tests/arm_compute_validation - Seed = 3778037091 + root@am62xx-evm:~# LD_LIBRARY_PATH=/usr/bin/arm-compute-library-52.7.0/tests/ /usr/bin/arm-compute-library-52.7.0/tests/arm_compute_validation + Version = c9a1fff898abd5109b759e8e16616519dc758fdd + CommandLine = /usr/bin/arm-compute-library-52.7.0/tests/arm_compute_validation + Seed = 165977448 cpu_has_sve = false cpu_has_sve2 = false cpu_has_svef32mm = false @@ -34,6 +34,7 @@ Arm Compute Libraries, tests, and sample executables are included in the SDK fil cpu_has_bf16 = false cpu_has_dotprod = false cpu_has_i8mm = false + cpu_has_fhm = false CPU0 = A53 CPU1 = A53 CPU2 = A53 @@ -41,15 +42,15 @@ Arm Compute Libraries, tests, and sample executables are included in the SDK fil Iterations = 1 Threads = 1 Dataset mode = PRECOMMIT - Running [0] 'UNIT/CPPScheduler/RethrowException' - Wall clock/Wall clock time: AVG=3466.0000 us + Running [0] 'UNIT/DataTypeUtils/CheckDataTypeIsPrinted@DataType=QSYMM8' + Wall clock/Wall clock time: AVG=3.0000 us .. code-block:: console - root@am62xx-evm:~# /usr/bin/arm-compute-library-24.12/examples/graph_alexnet + root@am62xx-evm:~# /usr/bin/arm-compute-library-52.7.0/examples/graph_alexnet - /usr/bin/arm-compute-library-24.12/examples/graph_alexnet + /usr/bin/arm-compute-library-52.7.0/examples/graph_alexnet Threads : 1 Target : Neon @@ -58,8 +59,8 @@ Arm Compute Libraries, tests, and sample executables are included in the SDK fil Tuner enabled? : false Cache enabled? : false Tuner mode : Normal - Tuner file : - MLGO file : + Tuner file : + MLGO file : Fast math enabled? 
: false Test passed @@ -69,16 +70,17 @@ Sample NN related executables (using Arm Compute Library only): .. code-block:: console - root@am62xx-evm:~# ls /usr/bin/arm-compute-library-24.12/examples/graph_* - graph_alexnet graph_inception_v4 graph_resnext50 graph_vgg19 - graph_deepspeech_v0_4_1 graph_lenet graph_shufflenet graph_vgg_vdsr - graph_edsr graph_mobilenet graph_squeezenet graph_yolov3 - graph_googlenet graph_mobilenet_v2 graph_squeezenet_v1_1 - graph_inception_resnet_v1 graph_resnet12 graph_srcnn955 - graph_inception_resnet_v2 graph_resnet50 graph_ssd_mobilenet - graph_inception_v3 graph_resnet_v2_50 graph_vgg16 + root@am62xx-evm:~# ls /usr/bin/arm-compute-library-52.7.0/examples/graph_* + graph_alexnet graph_lenet graph_squeezenet + graph_deepspeech_v0_4_1 graph_mobilenet graph_squeezenet_v1_1 + graph_edsr graph_mobilenet_v2 graph_srcnn955 + graph_googlenet graph_resnet12 graph_ssd_mobilenet + graph_inception_resnet_v1 graph_resnet50 graph_vgg16 + graph_inception_resnet_v2 graph_resnet_v2_50 graph_vgg19 + graph_inception_v3 graph_resnext50 graph_vgg_vdsr + graph_inception_v4 graph_shufflenet graph_yolov3 .. code-block:: console - root@am62xx-evm:~# ls /usr/bin/arm-compute-library-24.12/examples/neon_* + root@am62xx-evm:~# ls /usr/bin/arm-compute-library-52.7.0/examples/neon_* neon_cnn neon_copy_objects neon_gemm_qasymm8 neon_gemm_s8_f32 neon_permute neon_scale neon_sgemm diff --git a/source/linux/Foundational_Components/Machine_Learning/armnn.rst b/source/linux/Foundational_Components/Machine_Learning/armnn.rst index c22f1c2b7..574bf771e 100644 --- a/source/linux/Foundational_Components/Machine_Learning/armnn.rst +++ b/source/linux/Foundational_Components/Machine_Learning/armnn.rst @@ -23,4 +23,4 @@ in conjunction with the TIDL TensorFlow Lite Delegate. 
Supported versions ------------------ - - Arm NN 24.11 + - Arm NN 26.01 diff --git a/source/linux/Foundational_Components/Machine_Learning/nnstreamer.rst b/source/linux/Foundational_Components/Machine_Learning/nnstreamer.rst index 0f247a7d5..e7c977c88 100644 --- a/source/linux/Foundational_Components/Machine_Learning/nnstreamer.rst +++ b/source/linux/Foundational_Components/Machine_Learning/nnstreamer.rst @@ -12,7 +12,7 @@ https://nnstreamer.ai/ Supported versions ------------------ - - NNStreamer 2.4.2 + - NNStreamer 2.6.0 Testing NNStreamer ------------------ @@ -20,7 +20,7 @@ Testing NNStreamer .. code-block:: console root@am62xx-evm:~# nnstreamer-check - NNStreamer version: 2.4.2 + NNStreamer version: 2.6.0 loaded : TRUE path : /usr/lib/gstreamer-1.0/libnnstreamer.so ... diff --git a/source/linux/Foundational_Components/Machine_Learning/onnxrt.rst b/source/linux/Foundational_Components/Machine_Learning/onnxrt.rst index 5b9da7ab7..45d10440c 100644 --- a/source/linux/Foundational_Components/Machine_Learning/onnxrt.rst +++ b/source/linux/Foundational_Components/Machine_Learning/onnxrt.rst @@ -18,7 +18,7 @@ https://onnxruntime.ai/ Supported version ----------------- - - ONNX Runtime 1.20.1 + - ONNX Runtime 1.23.2 ONNX Runtime test applications ------------------------------ @@ -34,7 +34,7 @@ Running benchmark_model usage: perf_test [options...] model_path [result_file] Options: -m [test_mode]: Specifies the test mode. Value could be 'duration' or 'times'. - Provide 'duration' to run the test for a fix duration, and 'times' to repeated for a certain times. + Provide 'duration' to run the test for a fix duration, and 'times' to repeated for a certain times. -M: Disable memory pattern. -A: Disable memory arena -I: Generate tensor input binding (Free dimensions are treated as 1.) @@ -55,19 +55,19 @@ Running benchmark_model -o [optimization level]: Default is 99 (all). Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all). 
Please see onnxruntime_c_api.h (enum GraphOptimizationLevel) for the full list of all optimization levels. -u [optimized_model_path]: Specify the optimized model path for saving. - -d [CUDA only][cudnn_conv_algorithm]: Specify CUDNN convolution algorithms: 0(benchmark), 1(heuristic), 2(default). - -q [CUDA only] use separate stream for copy. + -d [CUDA only][cudnn_conv_algorithm]: Specify CUDNN convolution algorithms: 0(benchmark), 1(heuristic), 2(default). + -q [CUDA only] use separate stream for copy. -z: Set denormal as zero. When turning on this option reduces latency dramatically, a model may have denormals. - -C: Specify session configuration entries as key-value pairs: -C "| |" - Refer to onnxruntime_session_options_config_keys.h for valid keys and values. - [Example] -C "session.disable_cpu_ep_fallback|1 ep.context_enable|1" - -i: Specify EP specific runtime options as key value pairs. Different runtime options available are: + -C: Specify session configuration entries as key-value pairs: -C "| |" + Refer to onnxruntime_session_options_config_keys.h for valid keys and values. + [Example] -C "session.disable_cpu_ep_fallback|1 ep.context_enable|1" + -i: Specify EP specific runtime options as key value pairs. Different runtime options available are: [Usage]: -e -i '| |' - [ACL only] [enable_fast_math]: Options: 'true', 'false', default: 'false', + [ACL only] [enable_fast_math]: Options: 'true', 'false', default: 'false', -T [Set intra op thread affinities]: Specify intra op thread affinity string - [Example]: -T 1,2;3,4;5,6 or -T 1-2;3-4;5-6 + [Example]: -T 1,2;3,4;5,6 or -T 1-2;3-4;5-6 Use semicolon to separate configuration between threads. E.g. 1,2;3,4;5,6 specifies affinities for three threads, the first thread will be attached to the first and second logical processor. The number of affinities must be equal to intra_op_num_threads - 1 @@ -84,22 +84,22 @@ Example of running *onnxruntime_perf_test* on target using the pre-installed mob .. 
code-block:: console # /usr/bin/onnxruntime-tests/onnxruntime_perf_test -I -m times -r 8 -e acl -P /usr/bin/onnxruntime-tests/testdata/mobilenet_v3_small_excerpt.onnx - Session creation time cost: 0.0273071 s - First inference time cost: 20 ms - Total inference time cost: 0.14188 s + Session creation time cost: 0.139671 s + First inference time cost: 15 ms + Total inference time cost: 0.126396 s Total inference requests: 8 - Average inference time cost: 17.735 ms - Total inference run time: 0.141991 s - Number of inferences per second: 56.3415 - Avg CPU usage: 98 % - Peak working set size: 35299328 bytes - Avg CPU usage:98 - Peak working set size:35299328 + Average inference time cost: 15.7995 ms + Total inference run time: 0.126518 s + Number of inferences per second: 63.232 + Avg CPU usage: 100 % + Peak working set size: 37994496 bytes + Avg CPU usage:100 + Peak working set size:37994496 Runs:8 - Min Latency: 0.0159831 s - Max Latency: 0.0232702 s - P50 Latency: 0.0167086 s - P90 Latency: 0.0232702 s - P95 Latency: 0.0232702 s - P99 Latency: 0.0232702 s - P999 Latency: 0.0232702 s + Min Latency: 0.00955697 s + Max Latency: 0.0239688 s + P50 Latency: 0.0156388 s + P90 Latency: 0.0239688 s + P95 Latency: 0.0239688 s + P99 Latency: 0.0239688 s + P999 Latency: 0.0239688 s diff --git a/source/linux/Foundational_Components/Machine_Learning/tflite.rst b/source/linux/Foundational_Components/Machine_Learning/tflite.rst index 3a4de7735..567cef372 100644 --- a/source/linux/Foundational_Components/Machine_Learning/tflite.rst +++ b/source/linux/Foundational_Components/Machine_Learning/tflite.rst @@ -18,7 +18,7 @@ It supports on-device inference with low latency and a compact binary size. 
You Features ******** - - TensorFlow Lite v2.18.0 via Yocto - `meta-arago-extras/recipes-framework/tensorflow-lite/tensorflow-lite_2.18.0.bb `__ + - TensorFlow Lite v2.20.0 via Yocto - `meta-arago-extras/recipes-framework/tensorflow-lite/tensorflow-lite_2.20.0.bb `__ - Multithreaded computation with acceleration using Arm Neon SIMD instructions on Cortex-A cores - C++ Library and Python interpreter (supported Python version 3) - TensorFlow Lite Model benchmark Tool (i.e. :command:`benchmark_model`) @@ -89,23 +89,21 @@ The output of the benchmarking application should be similar to: root@am62xx-evm:~# /opt/tensorflow-lite/tools/benchmark_model --graph=/usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite --num_threads=4 --use_xnnpack=false INFO: STARTING! INFO: Log parameter values verbosely: [0] - INFO: Num threads: [4] INFO: Graph: [/usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite] INFO: Signature to run: [] - INFO: #threads used for CPU inference: [4] INFO: Use xnnpack: [0] INFO: Loaded model /usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite INFO: The input model file size (MB): 67.3128 - INFO: Initialized session in 6.418ms. + INFO: Initialized session in 5.579ms. INFO: Running benchmark for at least 1 iterations and at least 0.5 seconds but terminate if exceeding 150 seconds. - INFO: count=1 curr=1041765 + INFO: count=1 curr=1357602 p5=1357602 median=1357602 p95=1357602 INFO: Running benchmark for at least 50 iterations and at least 1 seconds but terminate if exceeding 150 seconds. 
- INFO: count=50 first=977738 curr=964908 min=911877 max=1112273 avg=971535 std=39112 + INFO: count=50 first=1249964 curr=1240143 min=1238588 max=1252566 avg=1.24027e+06 std=2565 p5=1238753 median=1239807 p95=1247415 - INFO: Inference timings in us: Init: 6418, First inference: 1041765, Warmup (avg): 1.04176e+06, Inference (avg): 971535 + INFO: Inference timings in us: Init: 5579, First inference: 1357602, Warmup (avg): 1.3576e+06, Inference (avg): 1.24027e+06 INFO: Note: as the benchmark tool itself affects memory footprint, the following is only APPROXIMATE to the actual memory footprint of the model at runtime. Take the information at your discretion. - INFO: Memory footprint delta from the start of the tool (MB): init=6.14844 overall=109.848 + INFO: Memory footprint delta from the start of the tool (MB): init=6.36328 overall=109.832 Where, @@ -130,26 +128,23 @@ The output of the benchmarking application should be similar to, root@am62xx-evm:~# /opt/tensorflow-lite/tools/benchmark_model --graph=/usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite --num_threads=4 --use_xnnpack=true INFO: STARTING! INFO: Log parameter values verbosely: [0] - INFO: Num threads: [4] INFO: Graph: [/usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite] INFO: Signature to run: [] - INFO: #threads used for CPU inference: [4] INFO: Use xnnpack: [1] INFO: Loaded model /usr/share/oob-demo-assets/models/ssd_mobilenet_v2_coco.tflite INFO: Created TensorFlow Lite XNNPACK delegate for CPU. INFO: XNNPACK delegate created. INFO: Explicitly applied XNNPACK delegate, and the model graph will be partially executed by the delegate w/ 1 delegate kernels. INFO: The input model file size (MB): 67.3128 - INFO: Initialized session in 592.232ms. + INFO: Initialized session in 614.333ms. INFO: Running benchmark for at least 1 iterations and at least 0.5 seconds but terminate if exceeding 150 seconds. 
- INFO: count=1 curr=633430 - + INFO: count=1 curr=905463 p5=905463 median=905463 p95=905463 INFO: Running benchmark for at least 50 iterations and at least 1 seconds but terminate if exceeding 150 seconds. - INFO: count=50 first=605745 curr=618849 min=568228 max=722188 avg=602943 std=27690 - - INFO: Inference timings in us: Init: 592232, First inference: 633430, Warmup (avg): 633430, Inference (avg): 602943 + INFO: count=50 first=900416 curr=898333 min=898007 max=906121 avg=899641 std=1549 p5=898333 median=899281 p95=904305 + INFO: Inference timings in us: Init: 614333, First inference: 905463, Warmup (avg): 905463, Inference (avg): 899641 INFO: Note: as the benchmark tool itself affects memory footprint, the following is only APPROXIMATE to the actual memory footprint of the model at runtime. Take the information at your discretion. - INFO: Memory footprint delta from the start of the tool (MB): init=133.086 overall=149.531 + INFO: Memory footprint delta from the start of the tool (MB): init=146.363 overall=150.141 + Where, @@ -166,14 +161,14 @@ The following performance numbers are captured with :command:`benchmark_model` o :header: "SOC", "Delegates", "Inference Time (sec)", "Initialization Time (ms)", "Overall Memory Footprint (MB)" :widths: 10, 10, 20, 20, 20 - "AM62X", "CPU only", "0.977168", "6.129", "110.07" - "", "XNNPACK", "0.613474", "593.558", "149.699" - "AM62PX", "CPU only", "0.419261", "4.79", "108.707" - "", "XNNPACK", "0.274756", "1208.04", "149.395" - "AM64X", "CPU only", "1.10675", "144.535", "109.562" - "", "XNNPACK", "0.702809", "601.33", "149.602" - "AM62L", "CPU only", "1.04867", "6.088", "110.129" - "", "XNNPACK", "0.661133", "466.216", "149.703" + "AM62X", "CPU only", "1.24027", "5.579", "109.832" + "", "XNNPACK", "0.899641", "614.333", "150.141" + "AM62PX", "CPU only", "1.23341", "252.390", "111.121" + "", "XNNPACK", "0.875280", "597.639", "150.52" + "AM64X", "CPU only", "1.26429", "135.579", "110.188" + "", "XNNPACK", "0.740743", 
"885.636", "150.484" + "AM62L", "CPU only", "1.3708", "807.076", "111.152" + "", "XNNPACK", "0.930577", "769.145", "150.496" Based on the above data, using the XNNPACK delegate significantly improves inference times across all SoCs, though it generally increases initialization time and overall memory footprint. diff --git a/source/linux/Foundational_Components_Machine_Learning.rst b/source/linux/Foundational_Components_Machine_Learning.rst index 5028f0fab..acd39ca5f 100644 --- a/source/linux/Foundational_Components_Machine_Learning.rst +++ b/source/linux/Foundational_Components_Machine_Learning.rst @@ -10,7 +10,7 @@ Machine Learning and use. Sitara machine learning today consists of ONNX Runtime, TensorFlow Lite, Arm NN, NNStreamer, and RNN library. - .. figure:: ../images/Sitara_machine_learning_stack_diagram.png + .. figure:: ../images/Sitara_machine_learning_stack_diagram.jpeg :align: center Sitara Machine Learnining Offering @@ -36,17 +36,19 @@ Machine Learning * Imports ONNX and TensorFlow Lite models. * Provides TensorFlow Lite delegate. - .. rubric:: `RNN Library `__ + .. rubric:: `Arm Compute Library `__ - * Provides Long Short-Term Memory (LSTM) and fully connected layers in a standalone library to allow for rapid prototyping of inference applications that require Recurrent Neural Networks. + * Open source inference engine available from Arm. * Runs on all Cortex-A ARM cores (AM3x, AM4x, AM5x, AM6x Sitara devices). - * Integrated into TI's Processor SDK Linux in an OOB demo for `Predictive Maintenance `__. + * Provides highly optimized kernels for NEON and CPU acceleration. + * Used as a backend to accelerate ML frameworks like Arm NN. - .. rubric:: `TI Deep Learning (TIDL) `__ + .. rubric:: `NNStreamer `__ - * Accelerates deep learning inference on C66x DSP cores and/or on Embedded Vision Engine (EVE) subsystems. - * Available on AM57x device only. - * Supports CNN at the moment, and imports Caffe, ONNX, and Tensorflow models. 
+ * Open source framework based on GStreamer for neural network pipelines. + * Runs on all Cortex-A ARM cores (AM3x, AM4x, AM5x, AM6x Sitara devices). + * Supports multiple backends such as TensorFlow Lite and Arm NN. + * Enables easy integration of ML inference into streaming pipelines. .. ifconfig:: CONFIG_part_family in ('J7_family') @@ -55,17 +57,20 @@ Machine Learning TI's Processor SDK Linux, free to download and use. Jacinto machine learning today consists of Neo-AI-DLR library. -+--------------------------+-----------+-----------------------+--------------------+--------------------+ -| ML inference Library | Version | Delegate / | Python API | C/C++ API | -| | | Execution provider | | | -+==========================+===========+=======================+====================+====================+ -| TensorFlow Lite | 2.18.0 | CPU, XNNPACK, ARMNN | YES | YES | -+--------------------------+-----------+-----------------------+--------------------+--------------------+ -| ONNX Runtime | 1.20.1 | CPU, ACL | YES | YES | -+--------------------------+-----------+-----------------------+--------------------+--------------------+ -| Arm NN | 1.20.1 | ACL | YES | YES | -+--------------------------+-----------+-----------------------+--------------------+--------------------+ - ++--------------------------+-----------+-------------------------+--------------------+--------------------+ +| ML inference Library | Version | Delegate / | Python API | C/C++ API | +| | | Execution provider | | | ++==========================+===========+=========================+====================+====================+ +| TensorFlow Lite | 2.20.0 | CPU, XNNPACK, ARMNN | YES | YES | ++--------------------------+-----------+-------------------------+--------------------+--------------------+ +| ONNX Runtime | 1.23.2 | CPU, ACL | YES | YES | ++--------------------------+-----------+-------------------------+--------------------+--------------------+ +| Arm NN | 26.01 | ACL | YES | YES | 
++--------------------------+-----------+-------------------------+--------------------+--------------------+ +| Arm Compute Library | 52.7.0 | NA (Backend Library) | YES | YES | ++--------------------------+-----------+-------------------------+--------------------+--------------------+ +| NNStreamer | 2.6.0 | NA (Pipeline Framework) | YES | YES | ++--------------------------+-----------+-------------------------+--------------------+--------------------+ .. toctree::