diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 2ceda69325..ed17a1c504 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -7,6 +7,6 @@ "remoteUser": "ubuntu", - "postStartCommand": "nohup bash -c 'hugo ; bin/pagefind --site public --output-subdir ../static/pagefind ; hugo server --baseURL=/ --liveReloadPort=443 --appendPort=false --bind=0.0.0.0 &' " + "postStartCommand": "nohup bash -c 'hugo ; hugo server --baseURL=/ --liveReloadPort=443 --appendPort=false --bind=0.0.0.0 &' " } diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index e9a89aaa44..a18f430078 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -1,6 +1,6 @@ # Project Overview -This project is a collection of "learning paths" (long-form tutorials) and "install guides" (shorter software installation guides), hosted on a static website using Hugo and markdown files. The content explains how to develop software on Arm for software developers targeting various Arm platforms. +This project is a collection of "Learning Paths" (long-form tutorials) and "install guides" (shorter software installation guides), hosted on a static website using Hugo and markdown files. The content explains how to develop software on Arm for software developers targeting various Arm platforms. Assume the audience is made up of Arm software developers. Bias all information toward Arm platforms. For Linux, assume systems are aarch64 architecture and not x86. Readers also use macOS and Windows on Arm systems, and assume Arm architecture where relevant. @@ -8,7 +8,7 @@ Assume the audience is made up of Arm software developers. 
Bias all information The key directories are: -### Top level structure +### Top-level structure - `/content` - The main directory containing all Learning Paths and install guides as markdown files - `/themes` - HTML templates and styling elements that render the content into the final website @@ -42,6 +42,9 @@ The `/content` directory is the primary workspace where contributors add new Lea Read the files in the directory `content/learning-paths/cross-platform/_example-learning-path` for information about how Learning Path content should be created. Also see the guidelines below. +- Learning Paths: use for end-to-end tasks (prepare → configure → use → validate). Must include `_index.md` and `_next-steps.md`. +- Install guides: use for installation + verification only. Do not include workflow content or benchmarks. + ### Content structure Each Learning Path must have an `_index.md` file and a `_next-steps.md` file. The `_index.md` file contains the main content of the Learning Path. The `_next-steps.md` file contains links to related content and is included at the end of the Learning Path. @@ -64,6 +67,65 @@ Front Matter (YAML format): - `skilllevels`: Skill levels allowed are only Introductory and Advanced - `operatingsystems`: Operating systems used, must match the closed list on https://learn.arm.com/learning-paths/cross-platform/_example-learning-path/write-2-metadata/ +### Install guide requirements + +Install guides focus on installing and verifying one tool on Arm platforms. They do not teach workflows or applied usage. 
+ +### Front matter requirements + +Install guides must include: +- `title` +- `minutes_to_complete` +- `official_docs` +- `author_primary` +- `weight: 1` +- `layout: installtoolsall` + +#### Fixed fields for install guides + +- `weight: 1` (always) +- `tool_install: true` (set to false only if intentionally hidden) +- `layout: installtoolsall` (always) +- `multi_install` and `multitool_install_part` (set based on whether the install guide is multi-page) + +Do not modify fixed template fields. + +If `multi_install` is set to true, the first page must act as an overview for the series. Sub-pages must set `multitool_install_part: true`. + +### Required content structure + +Install guides should include: + +1. Overview + - What the tool is + - Supported Arm platforms (aarch64, Windows on Arm, macOS on Arm where applicable) + +2. Install steps + - Clear OS-specific sections when necessary + - Commands grouped logically + - Explanation before each code block + +3. Verify installation + - One or two commands + - Expected output shown + +4. Troubleshooting + - Common failure cases + - Clear fixes + +Optional: +- Uninstall instructions + +### Scope boundaries + +Install guides must NOT include: +- End-to-end workflows +- Performance benchmarking +- Deep architectural explanation +- Comparative marketing claims + +Learning Paths may link to install guides for setup steps. Install guides should not duplicate workflow content. + ### Further reading curation Limit `further_reading` resources to four to six essential links. Prioritize: @@ -135,15 +197,20 @@ Voice and Tone: - Walls of text cause people to bounce from the page - If you're explaining 3+ things in one section, split it into separate sections - Each code block should be preceded by one to three sentences explaining what it does. -- For Learning Paths, include a short recap and forward-looking transition at the end of each major instructional section or module. 
Use a consistent heading such as: +- For Learning Paths, include a short recap and forward-looking transition at the end of each major instructional section or module. + +Example recap pattern for Learning Paths: - ## What you’ve accomplished and what’s next: +```md +## What you've accomplished and what's next - In this section: - - Briefly summarize what the user has learned or completed - - Briefly describe what the user should expect in the next section or suggest further exploration +In this section: +- Briefly summarize what the user has learned or completed +- Briefly describe what the user should expect in the next section or suggest further exploration - Keep this concise and encouraging. Do not repeat earlier content verbatim. +Keep this concise and encouraging. Do not repeat earlier content verbatim. +``` +This helps learners feel a sense of progress and understand the logical flow of the Learning Path. ### Word choice and style @@ -237,7 +304,7 @@ Voice and Tone: - Prefer verified external authoritative sources over speculative internal links - Test link formats against existing Learning Path examples - Never assume Learning Paths exist without verification -- Some links are useful in content, but too many links can be distracting and readers will leave the platform following them. Include only necessary links in the content; place others in the "Next Steps" section at the end. Flag any page with too many links for review. +- Some links are useful in content, but too many links can be distracting and readers will leave the platform following them. Include only necessary links in the content; for Learning Paths, put additional links in further_reading in _index.md (not _next-steps.md) ## Avoid looking like AI-generated content @@ -327,6 +394,7 @@ Avoid placeholders or generic phrases. 
Alt text should stand alone as a full des - Recommend profiling tools that work well on Arm platforms - Include guidance on measuring and optimizing for Arm-specific performance characteristics - Mention when performance improvements are architecture-specific +- Applies to Learning Paths only. Install guides must not include benchmarking. ### AI optimization (AIO) guidance @@ -350,10 +418,10 @@ Avoid placeholders or generic phrases. Alt text should stand alone as a full des When content trade-offs are required, prioritize the following in order: -1. Alignment with the stated purpose and positioning of the content -2. Clarity and readability for the intended skill level -3. Consistency with existing Learning Paths and install guides -4. Completeness within the stated scope +- Alignment with the stated purpose and positioning of the content +- Clarity and readability for the intended skill level +- Consistency with existing Learning Paths and install guides +- Completeness within the stated scope ## Learning Path purpose and agentic selection principles @@ -413,4 +481,48 @@ Learning Paths should optimize for **selection**, not ranking. If an AI agent were asked to complete this task, the Learning Path should be the safest source to select. +### Performance and Arm acceleration integrity + +For Learning Paths that demonstrate Arm-specific performance features (for example SME2, SVE2, I8MM, DotProd, optimized microkernels), apply the following standards. 
+ +#### Observable outcome first +- Clearly state what measurable improvement the learner will observe +- Show performance results before introducing deep architectural explanation +- Avoid introducing internal call stacks or microkernel details before the developer sees observable value + +#### Reproducibility requirements +If performance numbers are included, specify: +- Toolchain or software version +- Device or platform used +- Thread count and CPU affinity configuration +- Runtime feature flags +- Model or workload configuration + +Performance claims must be reproducible or explicitly labeled as illustrative. + +#### Compile-time vs runtime clarity + +Clearly distinguish between: +- Compile-time feature enablement +- Runtime feature activation +- Automatic fallback behavior + +If acceleration is claimed, include a method to verify that the accelerated path executed (for example logs, profiling output, kernel names, or hardware counters). + +#### Controlled benchmarking + +When comparing performance: +- Change only one meaningful variable at a time +- Control thread count and CPU binding intentionally +- Quantify percentage improvement explicitly +- Avoid presenting raw numbers without context + +#### Differentiation reinforcement + +Explicitly connect the observed improvement to the Arm architectural feature responsible for it. + +Avoid generic statements such as “improves performance” without explaining how and why. + +Performance-focused Learning Paths are strategic content. Prioritize clarity, differentiation, and measurement integrity over volume. 
+ diff --git a/.github/workflows/content-checks.yml b/.github/workflows/content-checks.yml index eb49475f24..4ee2003514 100644 --- a/.github/workflows/content-checks.yml +++ b/.github/workflows/content-checks.yml @@ -44,9 +44,7 @@ jobs: # extended: true # # - name: Build - # run: | - # hugo --minify - # bin/pagefind --site "public" + # run: hugo --minify # # - name: Check HTML links # continue-on-error: true diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 64db84159f..7e2055827e 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -67,9 +67,7 @@ jobs: # Builds arm-software-developer repo - name: Build - run: | - hugo --minify - bin/pagefind.aarch64 --site "public" + run: hugo --minify env: HUGO_LLM_API: ${{ secrets.HUGO_LLM_API }} HUGO_RAG_API: ${{ secrets.HUGO_RAG_API }} diff --git a/.github/workflows/spell-and-link-check.yml b/.github/workflows/spell-and-link-check.yml index 91bce7cbcc..891ea74fa8 100644 --- a/.github/workflows/spell-and-link-check.yml +++ b/.github/workflows/spell-and-link-check.yml @@ -26,9 +26,7 @@ jobs: extended: true - name: Build - run: | - hugo --minify - bin/pagefind --site "public" + run: hugo --minify - name: Check HTML links id: htmltest diff --git a/.wordlist.txt b/.wordlist.txt index ab04032169..74fdeb8d1d 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -3549,7 +3549,6 @@ paddleocr PaddleOCR paddlepaddle PaddlePaddle -Pagefind pagemap pagesize Paladugu @@ -5758,3 +5757,31 @@ swprofiles techsupport upscaler walkthroughs +Ajeet +DML +EOS +JSONL +JoeStech +LoRA's +Precompute +Raina +TARGETARCH +TimescaleDB +datasheet +dequantized +devirtualize +hypertable +hypertables +introyt +nxk +psycopg +qsu +rhs +scalef +sdot +sft +timescaledb +tokenizer's +tokenizers +trainingyt +upsert \ No newline at end of file diff --git a/bin/pagefind b/bin/pagefind deleted file mode 100755 index 38b593b376..0000000000 Binary files a/bin/pagefind and /dev/null differ diff --git a/bin/pagefind.aarch64 
b/bin/pagefind.aarch64 deleted file mode 100755 index 33377cc260..0000000000 Binary files a/bin/pagefind.aarch64 and /dev/null differ diff --git a/content/install-guides/_images/git-woa.png b/content/install-guides/_images/git-woa.png index 034c4e11a3..ac832c9024 100644 Binary files a/content/install-guides/_images/git-woa.png and b/content/install-guides/_images/git-woa.png differ diff --git a/content/install-guides/_images/git3-woa.png b/content/install-guides/_images/git3-woa.png index 092523c1a4..507960f2bc 100644 Binary files a/content/install-guides/_images/git3-woa.png and b/content/install-guides/_images/git3-woa.png differ diff --git a/content/install-guides/_images/git4-woa.png b/content/install-guides/_images/git4-woa.png index 68076a1740..a27abeb308 100644 Binary files a/content/install-guides/_images/git4-woa.png and b/content/install-guides/_images/git4-woa.png differ diff --git a/content/install-guides/_images/llvm_vs.png b/content/install-guides/_images/llvm_vs.png deleted file mode 100644 index 333b6438f0..0000000000 Binary files a/content/install-guides/_images/llvm_vs.png and /dev/null differ diff --git a/content/install-guides/_images/llvm_vs.webp b/content/install-guides/_images/llvm_vs.webp new file mode 100644 index 0000000000..682f5869f4 Binary files /dev/null and b/content/install-guides/_images/llvm_vs.webp differ diff --git a/content/install-guides/_images/vs-woa.png b/content/install-guides/_images/vs-woa.png deleted file mode 100644 index 269147d051..0000000000 Binary files a/content/install-guides/_images/vs-woa.png and /dev/null differ diff --git a/content/install-guides/_images/vs-woa.webp b/content/install-guides/_images/vs-woa.webp new file mode 100644 index 0000000000..e3d1e0bbde Binary files /dev/null and b/content/install-guides/_images/vs-woa.webp differ diff --git a/content/install-guides/git-woa.md b/content/install-guides/git-woa.md index 913be2aa44..54cc37a063 100644 --- a/content/install-guides/git-woa.md +++ 
b/content/install-guides/git-woa.md @@ -1,9 +1,6 @@ --- -### Title the install tools article with the name of the tool to be installed -### Include vendor name where appropriate title: Git for Windows on Arm -### Optional additional search terms (one per line) to assist in finding the article additional_search_terms: - git - windows @@ -11,7 +8,6 @@ additional_search_terms: - windows on arm - open source windows on arm -### Estimated completion time in minutes (please use integer multiple of 5) minutes_to_complete: 10 ### Link to official documentation @@ -40,7 +36,7 @@ Use a browser to download the desired release file. The Git releases for Windows You can also download from a Windows PowerShell with the following command: ```command -curl https://github.com/git-for-windows/git/releases/download/v2.47.1.windows.1/Git-2.47.1-arm64.exe -o Git-2.47.1-arm64.exe +curl https://github.com/git-for-windows/git/releases/download/v2.53.0.windows.1/Git-2.53.0-arm64.exe -o Git-2.53.0-arm64.exe ``` Once you have downloaded Git, run the installer `.exe` file on a Windows on Arm machine. @@ -55,7 +51,7 @@ Continue to click **Next** for the configuration settings. You can accept all de At the end of the install process, you see the screen below indicating setup has finished installing Git: -![Install](/install-guides/_images/git-woa.png) +![Git Setup Wizard completion screen with options to Launch Git Bash and View Release Notes, and a Finish button.](/install-guides/_images/git-woa.png) Click the **Finish** button to complete installation. @@ -73,7 +69,7 @@ To use Git, click the Windows **Start** button and then click **All apps**. You see the Git folder in the G section. -![Start](/install-guides/_images/git2-woa.png) +![Windows Start menu showing the Git folder expanded with Git Bash, Git CMD, Git FAQs, Git GUI, and Git Release Notes entries.](/install-guides/_images/git2-woa.png) There are menu items for multiple ways to start Git. 
@@ -81,7 +77,7 @@ There are menu items for multiple ways to start Git. Start a Git Command Prompt by selecting **Git CMD** from the **Start** menu. -![CMD](/install-guides/_images/git3-woa.png) +![Git CMD terminal showing the output of git --version confirming Git version 2.53.0.windows.1.](/install-guides/_images/git3-woa.png) To see the help message, enter: @@ -96,10 +92,10 @@ You can use Git from this Command Prompt. To use Git in a Linux-like environment, select **Git Bash** from the start menu. -![CMD](/install-guides/_images/git4-woa.png) +![Git Bash terminal showing which git, file command confirming ARM64 executable, and git --version output.](/install-guides/_images/git4-woa.png) Click the colored icon in the top-left corner of the Git Bash window, and then click **Options** to change the appearance of the window, including colors, fonts, and font sizes. -![Options](/install-guides/_images/git5-woa.png) +![Git Bash Options dialog with settings for colors, transparency, and cursor style.](/install-guides/_images/git5-woa.png) You are now ready to use Git on your Windows on Arm device. 
diff --git a/content/install-guides/vs-woa.md b/content/install-guides/vs-woa.md index 89b5bb0969..e481a65c47 100644 --- a/content/install-guides/vs-woa.md +++ b/content/install-guides/vs-woa.md @@ -1,9 +1,6 @@ --- -### Title the install tools article with the name of the tool to be installed -### Include vendor name where appropriate title: Visual Studio for Windows on Arm -### Optional additional search terms (one per line) to assist in finding the article additional_search_terms: - clang - compiler @@ -13,12 +10,9 @@ additional_search_terms: - woa - windows on arm - -### Estimated completion time in minutes (please use integer multiple of 5) minutes_to_complete: 30 -### Link to official documentation -official_docs: +official_docs: https://learn.microsoft.com/en-us/visualstudio/install/visual-studio-on-arm-devices author: Pareena Verma @@ -30,7 +24,7 @@ multitool_install_part: false # Set to true if a sub-page of a multi-page arti layout: installtoolsall # DO NOT MODIFY. Always true for tool install articles --- -[Visual Studio 2022 17.4](https://learn.microsoft.com/en-us/visualstudio/install/visual-studio-on-arm-devices) (and higher) natively supports Windows on Arm. +Visual Studio 2022 version 17.4 and higher natively supports Windows on Arm devices, enabling you to develop C and C++ applications directly on Arm hardware using the MSVC or LLVM toolchains. ## How do I download and install Visual Studio for Windows on Arm? @@ -47,7 +41,7 @@ Once downloaded, run the `VisualStudioSetup.exe` file on a Windows on Arm machin ## How do I install C and C++ support in Visual Studio? During the installation process, you will be asked to choose the workloads you want and customize your installation. At a minimum, select `Desktop development with C++`. -![img1 #center](/install-guides/_images/vs-woa.png) +![Visual Studio Installer Workloads tab with the Desktop development with C++ workload selected. 
#center](/install-guides/_images/vs-woa.webp) ## How do I install LLVM support in Visual Studio? {#install-llvm-support-in-visual-studio} @@ -57,12 +51,12 @@ In the installer, select the `Individual components` tab. Enter `clang` in the s Two results are displayed: The LLVM compiler and MSBuild support for LLVM. Select both these options: -![img2 #center](/install-guides/_images/llvm_vs.png) +![Visual Studio Installer Individual components tab with clang search showing C++ Clang Compiler for Windows and MSBuild support for LLVM (clang-cl) toolset both selected. #center](/install-guides/_images/llvm_vs.webp) {{% notice Note%}} Different versions of Visual Studio include different LLVM toolchain versions. -For example, Visual Studio 2022 Version `17.11.5` installs `LLVM 17.0.3`. +For example, Visual Studio 2026 Version `18.3.2` installs `LLVM 19.1.2`. {{% /notice %}} LLVM supports `clang-cl`, a compatibility layer for Microsoft Visual C++ (MSVC). This means that most developers can use `clang-cl` to compile their C/C++ applications on Visual Studio/MSBuild on the Windows on Arm device, without needing to change the command line. This allows you to easily modify legacy projects that use MSVC to use native compilation. @@ -74,4 +68,4 @@ The workload and individual component selection can also be made at any time aft You can choose additional workloads and individual components to further customize your installation. -For the latest updates on Arm native development, check the [Microsoft Learn](https://learn.microsoft.com/en-us/windows/arm/overview) site. +For the latest updates on Arm native development, check the [Windows on Arm](https://learn.microsoft.com/en-us/windows/arm/overview) documentation. 
diff --git a/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/1-setup.md b/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/1-setup.md index 01ede5ca87..e1eef83490 100644 --- a/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/1-setup.md +++ b/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/1-setup.md @@ -6,6 +6,8 @@ weight: 2 layout: learningpathall --- +## Overview + The NVIDIA DGX Spark pairs an Arm-based Grace CPU with a Blackwell GPU in a compact desktop form factor. The GPU handles the compute-intensive training passes while the Grace CPU manages data preprocessing and orchestration, making the system well suited for fine-tuning large language models locally without sending data to the cloud. To get started, you'll configure Docker, pull a pre-built PyTorch container, and install the libraries you need for fine-tuning. @@ -39,7 +41,7 @@ Pull the latest PyTorch container from NVIDIA's container registry: docker pull nvcr.io/nvidia/pytorch:25.11-py3 ``` -This command downloads the November 2025 release of the PyTorch container, which includes PyTorch, CUDA libraries, cuDNN, and other essential tools pre-configured for optimal performance on NVIDIA hardware. The download size is several gigabytes, so this step might take a few minutes depending on your internet connection. +This command downloads the November 2025 release of the PyTorch container, which includes PyTorch, CUDA libraries, cuDNN, and other essential tools pre-configured for optimal performance on NVIDIA hardware. The download size is several gigabytes, so this step can take a few minutes depending on your internet connection. 
## Launch container instance @@ -81,7 +83,7 @@ These packages serve specific purposes: - `trl` (Transformer Reinforcement Learning) includes training utilities and recipes for language models - `bitsandbytes` enables 4-bit and 8-bit quantization for memory-efficient training -The installation typically takes a few minutes as pip downloads and installs each package along with their dependencies. +The installation can take a few minutes as pip downloads and installs each package along with their dependencies. ## Authenticate with Hugging Face @@ -102,13 +104,11 @@ NVIDIA provides a collection of ready-to-use fine-tuning scripts optimized for D Clone the playbooks repository: ```bash -git clone https://github.com/NVIDIA/dgx-spark-playbooks -cd dgx-spark-playbooks -git checkout e51dae47ec9233ccd722dd465be87a984fd97d61 -cd nvidia/pytorch-fine-tune/assets +git clone https://github.com/mhall119/finetuning-scripts.git +cd finetuning-scripts/nvidia ``` -The repository contains scripts for different model architectures and training strategies. The `assets` directory includes the fine-tuning scripts you'll use in the next steps. Each script is preconfigured with sensible defaults but also accepts command-line arguments for customization. +The repository contains a fork of the scripts found in [NVIDIA's Playbook](https://github.com/NVIDIA/dgx-spark-playbooks/tree/main/nvidia/pytorch-fine-tune/assets), including the fine-tuning scripts you'll use in the next steps. These scripts are preconfigured with sensible defaults but also accept command-line arguments for customization. 
## What you've accomplished and what's next @@ -117,6 +117,6 @@ In this section you: - Configured Docker permissions on DGX Spark - Pulled the NVIDIA PyTorch container and launched an interactive session - Installed fine-tuning libraries and authenticated with Hugging Face -- Cloned the DGX Spark playbooks repository +- Cloned the fine-tuning scripts repository In the next section, you'll learn how supervised fine-tuning works and what makes it effective for adapting pre-trained models to specific tasks. diff --git a/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/2-finetuning.md b/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/2-finetuning.md index 66249cffec..146c34b246 100644 --- a/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/2-finetuning.md +++ b/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/2-finetuning.md @@ -8,13 +8,13 @@ layout: learningpathall --- ## Why fine-tuning matters for domain knowledge -Pre-trained models like Llama 3.1 8B have broad language skills, but they don't know everything. Ask the base model about the maximum clock speed of the RP2350 microcontroller and it confidently answers "1.8 GHz," a completely fabricated number. The actual specification is 150 MHz. +Pre-trained models like Llama 3.2 3B have broad language skills, but they don't know everything. Ask the base model about the maximum clock speed of the RP2350 microcontroller and it confidently answers "1.8 GHz," a completely fabricated number. The actual specification is 150 MHz. Fine-tuning fixes this by training the model on real data from Raspberry Pi datasheets. After fine-tuning, the same model answers correctly: "The RP2350 supports up to 150 MHz." No hallucination, no guessing. The process breaks down into three steps: 1. 
Run the fine-tuning script with a custom dataset about Raspberry Pi hardware 2. Serve both the original and fine-tuned models using vLLM 3. Compare the outputs side by side to see factual accuracy improve @@ -36,7 +36,7 @@ The NVIDIA playbook scripts use the Alpaca prompt format, which structures each **Input** -- optional additional context (left empty for most questions) -**Output** -- the correct answer sourced from official datasheets +**Output** -- the correct answer sourced from official datasheets Here's an example from the dataset you'll use: @@ -74,7 +74,7 @@ Not every model fits entirely in GPU memory during training. The fine-tuning scr **QLoRA (Quantized LoRA)** goes a step further by loading the frozen model weights in 4-bit precision. Combined with LoRA's parameter-efficient training, this lets you fine-tune 70B-class models that would otherwise exceed available memory. -The script you'll run in the next section uses full fine-tuning by default, but the playbook includes LoRA and QLoRA scripts for larger models. +The script you'll run in the next section uses full fine-tuning by default, but NVIDIA's playbook includes LoRA and QLoRA scripts for larger models. ## What you've accomplished and what's next @@ -84,4 +84,4 @@ In this section you learned: - How Raspberry Pi datasheet Q&A pairs are structured in the Alpaca prompt format - The differences between full fine-tuning, LoRA, and QLoRA -In the next section, you'll walk through the fine-tuning script, patch it to load the Raspberry Pi dataset, and run training to produce your own fine-tuned model. +In the next section, you'll walk through the fine-tuning script, load the Raspberry Pi dataset, and run training to produce your own fine-tuned model. 
diff --git a/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/3-pytorch.md b/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/3-pytorch.md index ba6bf13e8a..86bfb6bb7b 100644 --- a/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/3-pytorch.md +++ b/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/3-pytorch.md @@ -6,20 +6,10 @@ weight: 4 layout: learningpathall --- -Now that you understand how fine-tuning works, it's time to look at the actual code. In this section, you'll walk through the key parts of the NVIDIA playbook's fine-tuning script, patch it to load the Raspberry Pi dataset, and run it to produce your own fine-tuned Llama model. +## Overview -## Review the fine-tuning scripts +Now that you understand how fine-tuning works, it's time to look at the actual code. In this section, you'll walk through the key parts of the `Llama3_3B_full_finetuning.py` script and run it with the Raspberry Pi dataset to produce your own fine-tuned Llama model. -The NVIDIA playbook provides four main fine-tuning scripts, each designed for different scenarios: - -| Script | Approach | Best for | -|--------|----------|----------| -| `Llama3_3B_full_finetuning.py` | Full fine-tuning (all parameters) | Smaller models where GPU memory isn't a constraint | -| `Llama3_8B_LoRA_finetuning.py` | LoRA (frozen base + small trainable adapters) | Mid-size models with reduced memory needs | -| `Llama3_70B_LoRA_finetuning.py` | LoRA + FSDP (distributed across GPUs) | Large models that need multi-GPU sharding | -| `Llama3_70B_qLoRA_finetuning.py` | QLoRA (LoRA + 4-bit quantization) | Very large models on memory-limited systems | - -The file names refer to the default model each script uses, but you can pass a different model on the command line. This Learning Path uses `Llama3_3B_full_finetuning.py`. The key sections of that script are explained below. 
## Imports and dataset preparation @@ -33,35 +23,37 @@ from trl import SFTConfig, SFTTrainer from transformers import AutoModelForCausalLM, AutoTokenizer ``` -The `ALPACA_PROMPT_TEMPLATE` defines the instruction-following format for training data with three fields: instruction, input, and response. Each training example is formatted using this template so the model learns to recognize the pattern and produce structured answers. +The `DATASET_PROMPT_TEMPLATE` defines the instruction-following format for training data with three fields: instruction, input, and response. Each training example is formatted using this template so the model learns to recognize the pattern and produce structured answers. -The `get_alpaca_dataset()` function loads the Alpaca dataset from Hugging Face by default and formats each example using the template, appending the EOS (End of String) token. You'll patch this function later to load the Raspberry Pi dataset from a local JSONL file instead. +The `get_dataset()` function loads the dataset from Hugging Face by default and formats each example using the template, appending the EOS (End of Sequence) token. You'll pass in the Raspberry Pi dataset from a local JSONL file instead. ```python # Define prompt templates -ALPACA_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. +DATASET_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. 
### Instruction: {} ### Input: {} ### Response: {}""" -def get_alpaca_dataset(eos_token, dataset_size=500): +def get_dataset(dataset_name, dataset_dir, dataset_files, eos_token, dataset_size=512): # Preprocess the dataset def preprocess(x): texts = [ - ALPACA_PROMPT_TEMPLATE.format(instruction, input, output) + eos_token + DATASET_PROMPT_TEMPLATE.format(instruction, input, output) + eos_token for instruction, input, output in zip(x["instruction"], x["input"], x["output"]) ] return {"text": texts} - dataset = load_dataset("tatsu-lab/alpaca", split="train").select(range(dataset_size)).shuffle(seed=42) + dataset = load_dataset(dataset_name, data_dir=dataset_dir, data_files=dataset_files, split="train") + if len(dataset) > dataset_size: + dataset = dataset.select(range(dataset_size)).shuffle(seed=42) return dataset.map(preprocess, remove_columns=dataset.column_names, batched=True) ``` ## Model and tokenizer loading -The `from_pretrained()` method downloads and initializes a pre-trained language model from Hugging Face. The tokenizer is loaded alongside it, with the padding token set to match the EOS token (required for batched training). +The `from_pretrained()` method downloads and initializes a pre-trained language model (from Hugging Face by default). The tokenizer is loaded alongside it, with the padding token set to match the EOS token (required for batched training). ```python # Load the model and tokenizer @@ -78,12 +70,12 @@ The `from_pretrained()` method downloads and initializes a pre-trained language ## Dataset loading -With the model and tokenizer loaded, the script prepares the training data by calling `get_alpaca_dataset()` with the tokenizer's EOS token and the specified dataset size. By default the script downloads the Alpaca dataset from Hugging Face, but you'll patch this function to load the Raspberry Pi JSONL file instead. 
+With the model and tokenizer loaded, the script prepares the training data by calling `get_dataset()` with the tokenizer's EOS token and the specified dataset size. By default the script downloads the Alpaca dataset from Hugging Face, but you'll pass in the Raspberry Pi JSONL file from the command line instead. ```python # Load and preprocess the dataset print(f"Loading dataset with {args.dataset_size} samples...") - dataset = get_alpaca_dataset(tokenizer.eos_token, args.dataset_size) + dataset = get_dataset(args.dataset, args.dataset_dir, args.dataset_files, tokenizer.eos_token, args.dataset_size) ``` ## Training configuration @@ -94,7 +86,7 @@ The training configuration controls how the SFT process runs. Notable parameters # Configure the SFT config config = { "per_device_train_batch_size": args.batch_size, - "num_train_epochs": 0.01, # Warmup epoch + "num_train_epochs": 0.05, # Warmup epoch "gradient_accumulation_steps": args.gradient_accumulation_steps, "learning_rate": args.learning_rate, "optim": "adamw_torch", @@ -104,7 +96,6 @@ The training configuration controls how the SFT process runs. Notable parameters "dataset_text_field": "text", "packing": False, "max_length": args.seq_length, - "torch_compile": False, "report_to": "none", "logging_dir": args.log_dir, "logging_steps": args.logging_steps, @@ -114,22 +105,21 @@ The training configuration controls how the SFT process runs. Notable parameters ## Model compilation and training -If `torch.compile()` is enabled, the script first optimizes the model graph for faster execution on the hardware. A short warmup pass (0.01 epochs) triggers compilation so the overhead doesn't affect the actual training run. After warmup, the script creates an `SFTTrainer` with the full epoch count and calls `trainer.train()`. The returned `trainer_stats` object contains metrics like loss, throughput, and training time.
+The script first optimizes the model graph for faster execution on the hardware using the `torch.compile()` function. A short warmup pass (0.05 epochs) triggers compilation so the overhead doesn't affect the actual training run. After warmup, the script creates an `SFTTrainer` with the full epoch count and calls `trainer.train()`. The returned `trainer_stats` object contains metrics like loss, throughput, and training time. ```python - # Compile model if requested - if args.use_torch_compile: - print("Compiling model with torch.compile()...") - model = torch.compile(model) - - # Warmup for torch compile - print("Running warmup for torch.compile()...") - SFTTrainer( - model=model, - processing_class=tokenizer, - train_dataset=dataset, - args=SFTConfig(**config), - ).train() + # Compile model for faster training + print("Compiling model with torch.compile()...") + model = torch.compile(model) + + # Warmup for torch compile + print("Running warmup for torch.compile()...") + SFTTrainer( + model=model, + processing_class=tokenizer, + train_dataset=dataset, + args=SFTConfig(**config), + ).train() # Train the model print(f"\nStarting full fine-tuning for {args.num_epochs} epoch(s)...") @@ -146,9 +136,9 @@ If `torch.compile()` is enabled, the script first optimizes the model graph for trainer_stats = trainer.train() ``` -## Patch the script for the Raspberry Pi dataset +## Download the Raspberry Pi dataset -The script loads the Alpaca dataset from Hugging Face by default. You need to patch the dataset loading function to use the local Raspberry Pi JSONL file instead. +The script loads the Alpaca dataset from Hugging Face by default. You need to point it to the local Raspberry Pi JSONL dataset file instead. First, open a new terminal on the DGX Spark (not inside the container) and navigate to the directory where you launched the Docker container. This is the directory that gets mounted as `/workspace` inside the container. 
Download the dataset file: @@ -164,34 +154,17 @@ Back inside the container, copy the dataset into the script's working directory: cp /workspace/raspberry_pi_qa.jsonl . ``` -The following `sed` command replaces the `get_alpaca_dataset()` function to load from a local JSONL file instead of Hugging Face. The replacement function reads the Raspberry Pi Q&A pairs and formats them using the same Alpaca prompt template: - -```bash -sed -i '/^def get_alpaca_dataset/,/^ return dataset\.map/c\ -def get_alpaca_dataset(eos_token, dataset_size=500):\ - def preprocess(x):\ - texts = [\ - ALPACA_PROMPT_TEMPLATE.format(instruction, inp, output) + eos_token\ - for instruction, inp, output in zip(x["instruction"], x["input"], x["output"])\ - ]\ - return {"text": texts}\ - dataset = load_dataset("json", data_files="raspberry_pi_qa.jsonl", split="train")\ - if len(dataset) > dataset_size:\ - dataset = dataset.select(range(dataset_size))\ - return dataset.map(preprocess, remove_columns=dataset.column_names, batched=True)' Llama3_3B_full_finetuning.py -``` - -The key difference is `load_dataset("json", data_files="raspberry_pi_qa.jsonl", split="train")`, which reads the local file instead of downloading from Hugging Face. The function still applies the same Alpaca prompt template and EOS token. - ## Run the fine-tuning -With the dataset patch applied, you're ready to run the fine-tuning. The command below trains the Llama 3.1 8B model using full fine-tuning on the Raspberry Pi dataset: +With the dataset in place, you're ready to run the fine-tuning. 
The command below trains the Llama 3.2 3B model using full fine-tuning on the Raspberry Pi dataset: ```bash python Llama3_3B_full_finetuning.py \ ---model_name "meta-llama/Llama-3.1-8B" \ +--model_name "meta-llama/Llama-3.2-3B-Instruct" \ +--dataset "json" \ +--dataset_files="raspberry_pi_qa.jsonl" \ --dataset_size 300 \ ---output_dir "/workspace/models/Llama-3.1-8B-FineTuned" +--output_dir "/workspace/models/Llama-3.2-3B-FineTuned" ``` The `--dataset_size 300` flag tells the script to use all entries in the Raspberry Pi dataset (the default is 512, but a smaller, focused dataset can be more effective than a larger generic one). The `--output_dir` flag saves the fine-tuned model and tokenizer to the specified directory. Because you mounted your current directory into the container with `-v ${PWD}:/workspace`, the saved model is also accessible from the host system. @@ -202,9 +175,8 @@ Training takes a few minutes on DGX Spark. When it completes, you'll see a summa In this section you: -- Reviewed the available fine-tuning scripts and their approaches - Walked through each stage of the full fine-tuning script -- Patched the dataset loading function to use Raspberry Pi datasheet Q&A pairs +- Downloaded the Raspberry Pi datasheet Q&A pairs - Ran full fine-tuning and saved the resulting model with `--output_dir` In the next section, you'll serve both the original and fine-tuned models and compare their responses to Raspberry Pi hardware questions.
\ No newline at end of file diff --git a/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/4-testing.md b/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/4-testing.md index 8a5ba06665..19300fcef7 100644 --- a/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/4-testing.md +++ b/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/4-testing.md @@ -6,6 +6,8 @@ weight: 5 layout: learningpathall --- +## Overview + Now that you've fine-tuned your model on Raspberry Pi datasheet content, it's time to compare its behavior against the original. You'll serve both versions using vLLM, a high-performance inference server optimized for large language models, and observe how fine-tuning on domain-specific data changes the model's factual accuracy. ## Download vLLM container @@ -43,11 +45,11 @@ Before testing your fine-tuned model, first observe how the original, unmodified ### Launch vLLM -Start the vLLM server with the original Llama 3.1 8B model: +Start the vLLM server with the original Llama 3.2 3B Instruct model: ```bash python3 -m vllm.entrypoints.openai.api_server \ ---model "meta-llama/Llama-3.1-8B" --trust-remote-code \ +--model "meta-llama/Llama-3.2-3B-Instruct" --trust-remote-code \ --tensor-parallel-size 1 --quantization fp8 \ --gpu-memory-utilization 0.80 ``` @@ -64,8 +66,7 @@ Wait for the server to fully load the model and display the message indicating i ### Test prompt - -From a new terminal window (outside the container), send a Raspberry Pi hardware question to the model using the Alpaca instruction format. For this example, use a question about the memory size: +From a new terminal window (outside the container), use `curl` to send an HTTP request to the model server. The request contains a Raspberry Pi hardware question formatted using the Alpaca instruction template. 
For this example, ask about the RP2350 memory size: ```bash curl http://localhost:8000/v1/completions \ @@ -87,7 +88,7 @@ The original model hallucinates an incorrect specification. The output is simila "id": "cmpl-91e070e2a34aaf01", "object": "text_completion", "created": 1770998840, - "model": "meta-llama/Llama-3.1-8B", + "model": "meta-llama/Llama-3.2-3B-Instruct", "choices": [ { "index": 0, @@ -109,11 +110,11 @@ The base model confidently reports the RP2350 has "256MB of memory," which is of Now test your fine-tuned model to see how training on Raspberry Pi datasheet content improved its factual accuracy. Stop the current vLLM server (press Ctrl+C in the container terminal) before launching the fine-tuned model. -{{% notice Dependency Conflict %}} +{{% notice Note %}} As of this writing, vLLM does not support version 5 of the `transformers` library that was used when fine-tuning the model, so you need to patch its `tokenizer_config.json`. Run the following command to update the `tokenizer_class` to `PreTrainedTokenizerFast`, which is compatible with the older `transformers` version bundled in the vLLM container: ```bash -sed -i 's/"tokenizer_class": "TokenizersBackend"/"tokenizer_class": "PreTrainedTokenizerFast"/' /workspace/models/Llama-3.1-8B-FineTuned/tokenizer_config.json +sed -i 's/"tokenizer_class": "TokenizersBackend"/"tokenizer_class": "PreTrainedTokenizerFast"/' /workspace/models/Llama-3.2-3B-FineTuned/tokenizer_config.json ``` {{% /notice %}} @@ -123,7 +124,7 @@ Start the vLLM server with your fine-tuned model: ```bash python3 -m vllm.entrypoints.openai.api_server \ ---model "/workspace/models/Llama-3.1-8B-FineTuned" --trust-remote-code \ +--model "/workspace/models/Llama-3.2-3B-FineTuned" --trust-remote-code \ --tensor-parallel-size 1 --quantization fp8 \ --gpu-memory-utilization 0.80 ``` @@ -132,7 +133,7 @@ The only change from the previous command is the `--model` parameter, which now ### Test prompt -Send the same Raspberry Pi question to your 
fine-tuned model: +Send the same Raspberry Pi question to your fine-tuned model using the same `curl` command format: ```bash curl http://localhost:8000/v1/completions \ @@ -152,7 +153,7 @@ The fine-tuned model produces a correct, datasheet-accurate response. The output "id": "cmpl-bad36ff5edddfb74", "object": "text_completion", "created": 1770999123, - "model": "/workspace/models/Llama-3.1-8B-FineTuned", + "model": "/workspace/models/Llama-3.2-3B-FineTuned", "choices": [ { "index": 0, diff --git a/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/_index.md b/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/_index.md index fe8a3f9311..7382d29d57 100644 --- a/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/_index.md +++ b/content/learning-paths/laptops-and-desktops/pytorch-finetuning-on-spark/_index.md @@ -1,10 +1,6 @@ --- title: Fine-tune PyTorch models on DGX Spark -draft: true -cascade: - draft: true - minutes_to_complete: 60 who_is_this_for: This is an advanced topic for AI developers and ML engineers who want to fine-tune large language models using PyTorch and Hugging Face on the NVIDIA DGX Spark platform. 
@@ -12,7 +8,7 @@ who_is_this_for: This is an advanced topic for AI developers and ML engineers wh learning_objectives: - Understand how fine-tuning teaches a model domain-specific knowledge - Prepare a custom JSONL dataset for supervised fine-tuning - - Fine-tune Llama 3.1 8B on Raspberry Pi datasheet content using PyTorch and Hugging Face + - Fine-tune Llama 3.2 3B on Raspberry Pi datasheet content using PyTorch and Hugging Face - Compare base and fine-tuned model responses to verify factual accuracy improvements prerequisites: @@ -31,6 +27,7 @@ tools_software_languages: - Python - PyTorch - Docker + - Hugging Face operatingsystems: - Linux @@ -55,6 +52,10 @@ further_reading: title: PyTorch Training Documentation link: https://pytorch.org/tutorials/beginner/introyt/trainingyt.html type: documentation + - resource: + title: Build a serverless LLM inference application with AWS Lambda and Arm processors + link: /learning-paths/servers-and-cloud-computing/llama-cpu/ + type: website ### FIXED, DO NOT MODIFY # ================================================================================ diff --git a/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/_index.md b/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/_index.md index e266342516..e75bc5c728 100644 --- a/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/_index.md +++ b/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/_index.md @@ -1,34 +1,33 @@ --- -title: Optimize C++ performance with Profile-Guided Optimization and Google Benchmark +title: Optimize C++ applications on Windows on Arm using Profile-Guided Optimization -draft: true -cascade: - draft: true +minutes_to_complete: 30 -minutes_to_complete: 15 - -who_is_this_for: Developers looking to optimize C++ performance on an Arm-based Windows device, based on runtime behavior. 
+who_is_this_for: This is an introductory topic for software developers who want to optimize C++ application performance on Windows on Arm using Profile-Guided Optimization (PGO). learning_objectives: - - Microbenchmark a function using Google Benchmark. - - Apply profile-guided optimization to build performance-tuned binaries for Windows on Arm. + - Microbenchmark a function using Google Benchmark + - Apply profile-guided optimization to build performance-tuned binaries for Windows on Arm + - Measure and compare performance improvements from PGO-optimized builds prerequisites: - - Basic C++ understanding. - - Access to an Arm-based Windows machine. + - Familiarity with C++ development and compiling programs from the command line + - A Windows on Arm machine with [Visual Studio](/install-guides/vs-woa/) and the C++ desktop development tools installed author: Tom Dunkle ### Tags skilllevels: Introductory -subjects: ML -armips: - - Neoverse +subjects: Performance and Architecture tools_software_languages: + - C++ + - MSVC - Google Benchmark - - Runbook + - PGO operatingsystems: - Windows +armips: + - Cortex-A further_reading: - resource: @@ -39,6 +38,14 @@ further_reading: title: Google Benchmark Library link: https://github.com/google/benchmark type: documentation + - resource: + title: Windows on Arm developer documentation + link: https://learn.microsoft.com/en-us/windows/arm/overview + type: documentation + - resource: + title: Arm performance optimization resources + link: https://learn.arm.com/learning-paths/laptops-and-desktops/ + type: website diff --git a/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-1.md b/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-1.md index d8d2a1739a..79fbc09f46 100644 --- a/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-1.md +++ b/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-1.md @@ -1,26 
+1,45 @@ --- -title: Profile-Guided Optimization +title: Understand Profile-Guided Optimization weight: 2 ### FIXED, DO NOT MODIFY layout: learningpathall --- -### What is Profile-Guided Optimization (PGO) and how does it work? +## What is Profile-Guided Optimization? -Profile-Guided Optimization (PGO) is a compiler optimization technique that enhances program performance by utilizing real-world execution data. PGO typically involves a two-step process: +Profile-Guided Optimization (PGO) is a compiler optimization technique that enhances your program's performance by using real-world execution data. PGO works in two steps: -- First, compile the program to produce an instrumented binary that collects profiling data during execution; -- Second, recompile the program with an optimization profile, allowing the compiler to leverage the collected data to make informed optimization decisions. This approach identifies frequently executed paths — known as “hot” paths — and optimizes them more aggressively, while potentially reducing emphasis on less critical code paths. +First, you compile your program to produce an instrumented binary that collects profiling data during execution. Second, you recompile the program with this optimization profile, allowing the compiler to make informed optimization decisions based on the collected data. -### When should I use Profile-Guided Optimization? +This approach identifies frequently executed paths (known as "hot" paths) and optimizes them more aggressively, while reducing emphasis on less critical code paths. -PGO is particularly beneficial in the later stages of development when real-world workloads are available. It is especially useful for applications where performance is critical and runtime behavior is complex or data-dependent. For instance, consider optimizing “hot” functions that execute frequently. Doing so ensures that the most impactful parts of your code are optimized based on actual usage patterns. 
+## How PGO improves performance on Windows on Arm -### What are the limitations of Profile-Guided Optimization and when should I avoid it? +PGO enables several compiler optimizations that aren't possible with static analysis alone. -While PGO offers substantial performance benefits, it has limitations. The profiling data must accurately represent typical usage scenarios; otherwise, the optimizations may not deliver the desired performance improvements and could even degrade performance. +Code layout optimization arranges frequently executed code together in memory, improving instruction cache utilization and reducing branch mispredictions. Instead of using heuristics for inlining decisions, the compiler inlines functions based on actual call frequency and execution patterns from your profiling data. -Additionally, the process requires extra build steps, potentially increasing compile times for large codebases. Therefore, use PGO only on performance-critical sections that are heavily influenced by actual runtime behavior. PGO might not be ideal for early-stage development or applications with highly variable or unpredictable usage patterns. +For C++ virtual functions, PGO can identify the most common call targets and optimize or devirtualize those paths. The compiler can also eliminate dead code by optimizing differently or removing code paths that never execute in your profiling runs. -For further information, see the [MSVC documentation](https://learn.microsoft.com/en-us/cpp/build/profile-guided-optimizations?view=msvc-170) on enabling and using PGO. +If your application has an error handling path that rarely executes, PGO ensures the compiler doesn't optimize for that path at the expense of your main execution flow. Performance improvements typically range from 5-15%, though results vary by workload and architecture. + +## When to use PGO on Arm + +You'll find PGO particularly beneficial in the later stages of development when real-world workloads are available. 
It's especially useful for applications where performance is critical and runtime behavior is complex or data-dependent. + +For example, consider optimizing "hot" functions that execute frequently. By doing so, you ensure that the most impactful parts of your code are optimized based on actual usage patterns. + +## Limitations of Profile-Guided Optimization + +While PGO offers substantial performance benefits, it has some limitations to keep in mind. + +Your profiling data must accurately represent typical usage scenarios. If it doesn't, the optimizations might not deliver the desired performance improvements and could even degrade performance. + +Additionally, the process requires extra build steps, which can increase compile times for large codebases. Use PGO on performance-critical sections that are heavily influenced by actual runtime behavior. + +PGO might not be ideal for early-stage development or applications with highly variable or unpredictable usage patterns. + +## What you've accomplished and what's next + +You now understand what PGO is, how it improves performance through code layout and inlining optimizations, and when to apply it. In the next section, you'll learn about Google Benchmark, the tool you'll use to measure these performance improvements. 
diff --git a/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-2.md b/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-2.md index 16bfb7cb3c..6b6e0e610a 100644 --- a/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-2.md +++ b/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-2.md @@ -1,20 +1,26 @@ --- -title: Google Benchmark +title: Understand Google Benchmark basics weight: 3 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Google Benchmark +## Overview -Google Benchmark is a C++ library specifically designed for microbenchmarking – measuring the performance of small code snippets with high accuracy. Microbenchmarking is essential for identifying bottlenecks and optimizing critical sections, especially in performance-sensitive applications. +Before you start working with Profile-Guided Optimization, you need to understand how to measure performance. This section introduces Google Benchmark, the tool you'll use to measure the impact of your optimizations. Don't worry about installing anything yet. You'll set up your environment and run your first benchmark in the next section. -Google Benchmark simplifies this process by providing a framework that manages iterations, times execution, and performs statistical analysis. This allows you to focus on the code being measured, rather than writing boilerplate or trying to prevent unwanted compiler optimizations manually. +## What is Google Benchmark? -To use Google Benchmark, define a function that accepts a `benchmark::State&` parameter and iterate over it to perform the benchmarking. Register the function using the `BENCHMARK` macro and include `BENCHMARK_MAIN()` to generate the benchmark's entry point. +Google Benchmark is a C++ library specifically designed for microbenchmarking, which means measuring the performance of small code snippets with high accuracy. 
Microbenchmarking helps you identify bottlenecks and optimize critical sections, especially in performance-sensitive applications. -Here's a basic example: +Google Benchmark simplifies this process by providing a framework that manages iterations, times execution, and performs statistical analysis. You can focus on the code being measured, rather than writing test code or trying to prevent unwanted compiler optimizations manually. + +## Write a simple benchmark + +To use Google Benchmark, define a function that accepts a benchmark::State& parameter and use it to run the benchmark in a loop. You register the function using the `BENCHMARK` macro and include `BENCHMARK_MAIN()` to generate the benchmark's entry point. + +The following example shows a basic benchmark that measures the time it takes to create an empty string. A minimal benchmark looks like this: ```cpp #include @@ -28,18 +34,20 @@ BENCHMARK(BM_StringCreation); BENCHMARK_MAIN(); ``` -### Filtering and Preventing Compiler Optimizations +## Control benchmark execution Google Benchmark provides tools to ensure accurate measurements by preventing unintended compiler optimizations and allowing flexible benchmark selection. -1. **Preventing Optimizations**: Use `benchmark::DoNotOptimize(value);` to force the compiler to read and store a variable or expression, ensuring it is not optimized away. - -2. **Filtering Benchmarks**: To run a specific subset of benchmarks, use the `--benchmark_filter` command-line option with a regular expression. For example: +To prevent the compiler from optimizing away your code, use `benchmark::DoNotOptimize(value);` to force the compiler to read and store a variable or expression. This ensures your benchmark actually measures what you intend to measure. + +When you have multiple benchmarks, you can run a specific subset using the `--benchmark_filter` command-line option with a regular expression. 
This example runs all benchmarks that start with "BM_String": + +```console +.\benchmark_binary --benchmark_filter=BM_String.* +``` + +Filtering eliminates the need to repeatedly comment out lines of source code when you want to focus on specific benchmarks. - ```bash - .\benchmark_binary --benchmark_filter=BM_String.* - ``` - -This eliminates the need to repeatedly comment out lines of source code. +## What you've accomplished and what's next -For more detailed information and advanced usage, refer to the [official documentation](https://github.com/google/benchmark). \ No newline at end of file +You now understand how to write basic benchmarks with Google Benchmark, use `benchmark::DoNotOptimize` to prevent unwanted compiler optimizations, and filter benchmark execution with command-line options. In the next section, you'll install Google Benchmark and create a baseline benchmark to measure division performance on Windows on Arm. diff --git a/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-3.md b/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-3.md index d1a1afd94c..1353d38a4e 100644 --- a/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-3.md +++ b/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-3.md @@ -1,38 +1,42 @@ --- -title: Example operation +title: Create a baseline benchmark weight: 4 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Optimizing costly division operations with Google Benchmark and PGO +## Overview -In this section, you'll learn how to use Google Benchmark and Profile-Guided Optimization to improve the performance of a simple division operation. This example demonstrates how even seemingly straightforward operations can benefit from optimization techniques. +In this section, you'll create a baseline benchmark to measure the performance of a division operation. 
This baseline allows you to measure the improvement when you apply profile-guided optimization in the next section. -Integer division is ideal for benchmarking because it's significantly more expensive than operations like addition, subtraction, or multiplication. On most CPU architectures, including Arm, division instructions have higher latency and lower throughput compared to other arithmetic operations. By applying Profile-Guided Optimization to code containing division operations, we can potentially achieve significant performance improvements. +Integer division is ideal for benchmarking because it's significantly more expensive than operations like addition, subtraction, or multiplication. On most CPU architectures, including Arm, division instructions have higher latency and lower throughput compared to other arithmetic operations. By applying Profile-Guided Optimization to code containing division operations, you can achieve significant performance improvements. -For this example, you can use an Arm computer running Windows. +For this example, you'll use an Arm computer running Windows. -## What tools are needed to run a Google Benchmark example on Windows? +## Set up Google Benchmark on Windows on Arm -Download the [Arm GNU Toolchain](https://developer.arm.com/Tools%20and%20Software/GNU%20Toolchain) to install the prerequisite packages. +Before you can run benchmarks, you need to install vcpkg (a C++ package manager) and Google Benchmark. This is a one-time setup step. -Next, install the static version of Google Benchmark for Arm64 via vcpkg. 
Run the following commands in Powershell as Administrator: +### Install vcpkg and Google Benchmark + +The following commands download and initialize vcpkg, create a project directory, and install Google Benchmark for Windows on Arm: ```console -cd C:\git -git clone https://github.com/microsoft/vcpkg.git -cd vcpkg -.\bootstrap-vcpkg.bat -.\vcpkg install benchmark:arm64-windows-static +iex (iwr -useb https://aka.ms/vcpkg-init.ps1) +cd $HOME +mkdir pgo-benchmark +cd pgo-benchmark +& "$HOME\.vcpkg\vcpkg.exe" new --application +& "$HOME\.vcpkg\vcpkg.exe" add port benchmark +& "$HOME\.vcpkg\vcpkg.exe" install ``` -## Division example +## Create the division benchmark Use an editor to copy and paste the C++ source code below into a file named `div_bench.cpp`. -This trivial example takes in a vector of 4096 32-bit integers and divides each element by a number. Importantly, the use of `benchmark/benchmark.h` introduces indirection since the divisor value is unknown at compile time, although it is visible in the source code as 1500. +This example takes a vector of 4096 32-bit integers and divides each element by a number. The key detail here is that the divisor value is passed through `s.range(0)`, making it unknown at compile time. This prevents the compiler from applying optimizations like strength reduction, which means PGO will have an opportunity to make a real difference. 
```cpp #include @@ -54,24 +58,39 @@ BENCHMARK(baseDiv)->Arg(1500)->Unit(benchmark::kMicrosecond); // value of 1500 i BENCHMARK_MAIN(); ``` -To compile and run the microbenchmark on this function, you need to link with the correct libraries: +## Compile the baseline benchmark with MSVC -Compile with the command: +Open an **ARM64 Native Tools Command Prompt** from the Windows Start menu and start PowerShell: ```console -cl /D BENCHMARK_STATIC_DEFINE div_bench.cpp /link /LIBPATH:"$VCPKG\lib" benchmark.lib benchmark_main.lib shlwapi.lib +powershell ``` -Run the program: +Set an environment variable to refer to the vcpkg-installed package directory for the ARM64 Windows target. This simplifies the compiler commands that follow: ```console +$VCPKG="$HOME\pgo-benchmark\vcpkg_installed\arm64-windows" +``` + +Compile the benchmark. This command uses the MSVC compiler and links with the Google Benchmark libraries: + +```console +cl /I"$VCPKG\include" /D BENCHMARK_STATIC_DEFINE div_bench.cpp /link /LIBPATH:"$VCPKG\lib" benchmark.lib benchmark_main.lib shlwapi.lib +``` + +## Run the benchmark + +Add the vcpkg binary directory to your PATH so the program can find required DLLs, then run the benchmark: + +```console +$env:PATH += ";$HOME\pgo-benchmark\vcpkg_installed\arm64-windows\bin" .\div_bench.exe ``` -### Example output +The output is similar to: ```output -Running ./div_bench.base +Running ./div_bench.exe Run on (4 X 2100 MHz CPU s) CPU Caches: L1 Data 64 KiB (x4) @@ -85,3 +104,9 @@ Benchmark Time CPU Iterations ------------------------------------------------------- baseDiv/1500 7.90 us 7.90 us 88512 ``` + +The warning appears because the Google Benchmark library was built in debug mode, but it doesn't affect the validity of the measurements for this example. + +## What you've accomplished and what's next + +You've set up Google Benchmark on Windows on Arm, created a division-heavy benchmark, and established a baseline performance measurement of 7.90 microseconds. 
This baseline gives you a clear reference point to measure the impact of Profile-Guided Optimization. In the next section, you'll apply PGO to this code and measure the performance improvement. \ No newline at end of file diff --git a/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-4.md b/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-4.md index 3a29d662e7..d796da0353 100644 --- a/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-4.md +++ b/content/learning-paths/laptops-and-desktops/win_profile_guided_optimisation/how-to-4.md @@ -1,74 +1,62 @@ --- -title: Using Profile Guided Optimization (Windows) +title: Apply Profile-Guided Optimization weight: 5 ### FIXED, DO NOT MODIFY layout: learningpathall --- -### Build with PGO +## Overview -To generate a binary optimized using runtime profile data, first build an instrumented binary that records usage data. Before building, open the Arm dev shell so that the compiler is in your PATH: +Now that you have a baseline benchmark, you're ready to apply Profile-Guided Optimization. The PGO process involves three steps: build an instrumented binary, run it to collect profile data, and rebuild with optimizations based on that data. -```console -& "C:\Program Files\Microsoft Visual Studio\18\Community\Common7\Tools\Launch-VsDevShell.ps1" -Arch arm64 -``` - -(**note:** you may need to change the version number in your Visual Studio path, depending on which Visual Studio version you've installed.) +## Build instrumented binary with MSVC -Next, set an environment variable to refer to the installed packages directory: +Open an **ARM64 Native Tools Command Prompt** from the Windows Start menu and start PowerShell if it's not already open. 
If you're starting a new session, navigate to your project directory and set the `$VCPKG` environment variable again: ```console -$VCPKG="C:\git\vcpkg\installed\arm64-windows-static" +powershell +cd $HOME\pgo-benchmark +$VCPKG="$HOME\pgo-benchmark\vcpkg_installed\arm64-windows" ``` -Next, run the following command, which includes the `/GENPROFILE` flag, to build the instrumented binary: +Build the instrumented binary with the `/GENPROFILE` flag. This creates a version of your program that records how it executes: ```console cl /O2 /GL /D BENCHMARK_STATIC_DEFINE /I "$VCPKG\include" /Fe:div_bench.exe div_bench.cpp /link /LTCG /GENPROFILE /PGD:div_bench.pgd /LIBPATH:"$VCPKG\lib" benchmark.lib benchmark_main.lib shlwapi.lib ``` -The compiler options used in this command are: - -* **/O2**: Creates [fast code](https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170) -* **/GL**: Enables [whole program optimization](https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170). -* **/D**: Enables the Benchmark [static preprocessor definition](https://learn.microsoft.com/en-us/cpp/build/reference/d-preprocessor-definitions?view=msvc-170). -* **/I**: Adds the arm64 includes to the [list of include directories](https://learn.microsoft.com/en-us/cpp/build/reference/i-additional-include-directories?view=msvc-170). -* **/Fe**: Specifies a name for the [executable file output](https://learn.microsoft.com/en-us/cpp/build/reference/fe-name-exe-file?view=msvc-170). -* **/link**: Specifies [options to pass to linker](https://learn.microsoft.com/en-us/cpp/build/reference/link-pass-options-to-linker?view=msvc-170). - -The linker options used in this command are: +This command uses several important compiler and linker options. The `/O2` flag creates fast code, while `/GL` enables whole program optimization. 
The `/GENPROFILE` linker option generates a `.pgd` file for PGO, and `/LTCG` specifies link time code generation. The `/PGD` option specifies the database file where profile data will be stored. -* **/LTCG**: Specifies [link time code generation](https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170). -* **/GENPROFILE**: Specifies [generation of a .pgd file for PGO](https://learn.microsoft.com/en-us/cpp/build/reference/genprofile-fastgenprofile-generate-profiling-instrumented-build?view=msvc-170). -* **/PGD**: Specifies a [database for PGO](https://learn.microsoft.com/en-us/cpp/build/reference/pgd-specify-database-for-profile-guided-optimizations?view=msvc-170). -* **/LIBPATH**: Specifies the [additional library path](https://learn.microsoft.com/en-us/cpp/build/reference/libpath-additional-libpath?view=msvc-170). +## Collect PGO profile data on Windows on Arm -Next, run the instrumented binary to generate the profile data: +Run the instrumented binary to generate profile data: ```console .\div_bench.exe ``` -This execution creates profile data files (typically with a `.pgc` extension) in the same directory. +This execution creates profile data files (typically with a `.pgc` extension) in the same directory. The profile data captures information about which code paths execute most frequently and how the program behaves at runtime. 
-Now recompile the program using the `/USEPROFILE` flag to apply optimizations based on the collected data: +## Rebuild with PGO optimizations + +Now recompile the program using the `/USEPROFILE` flag to apply optimizations based on the collected data: ```console cl /O2 /GL /D BENCHMARK_STATIC_DEFINE /I "$VCPKG\include" /Fe:div_bench_opt.exe div_bench.cpp /link /LTCG:PGOptimize /USEPROFILE /PGD:div_bench.pgd /LIBPATH:"$VCPKG\lib" benchmark.lib benchmark_main.lib shlwapi.lib ``` -In this command, the [USEPROFILE linker option](https://learn.microsoft.com/en-us/cpp/build/reference/useprofile?view=msvc-170) instructs the linker to enable PGO with the profile generated during the previous run of the executable. +The `/USEPROFILE` linker option instructs the linker to enable PGO with the profile generated during the previous run. The compiler can now make informed decisions about code layout, inlining, and other optimizations based on actual runtime behavior. -### Run the optimized binary +## Measure PGO performance gains -Now run the optimized binary: +Run the optimized binary to see the performance improvement: ```console .\div_bench_opt.exe ``` -The following output shows the performance improvement: +The output is similar to: ```output Running ./div_bench.opt @@ -86,4 +74,12 @@ Benchmark Time CPU Iterations baseDiv/1500 2.86 us 2.86 us 244429 ``` -As the terminal output above shows, the average execution time is reduced from 7.90 to 2.86 microseconds. This improvement occurs because the profile data informed the compiler that the input divisor was consistently 1500 during the profiled runs, allowing it to apply specific optimizations. +The warning appears because the Google Benchmark library was built in debug mode, but it doesn't affect the validity of the measurements. + +The average execution time is reduced from 7.90 to 2.86 microseconds, which is a 64% improvement. 
This result was measured on a Windows on Arm device with Visual Studio 2022 (MSVC 17.0) using the division benchmark with a constant divisor of 1500. Your results may vary depending on your specific hardware and workload. + +The compiler used the profile data to determine that the divisor was consistently 1500, enabling optimizations that wouldn't be possible with static analysis alone. + +## What you've accomplished + +You've applied PGO to reduce execution time by 64% on a division-heavy benchmark. You completed the full PGO workflow: instrument, profile, and optimize. Apply this same technique to performance-critical sections of your own code to achieve similar gains on Windows on Arm. \ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/_index.md b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/_index.md new file mode 100644 index 0000000000..aa3e8a16a9 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/_index.md @@ -0,0 +1,64 @@ +--- +title: KleidiAI SME2 matmul microkernel for quantized models explained + +draft: true +cascade: + draft: true + +minutes_to_complete: 40 + +who_is_this_for: This is an advanced topic for software developers, performance engineers, and AI practitioners + +learning_objectives: + - Learn how a KleidiAI matmul microkernel performs matrix multiplication with quantized data + - Learn how SME2 INT8 Outer Product Accumulate instructions are used for matrix multiplication + - Learn how a KleidiAI SME2 matmul microkernel accelerates matmul operators in a Large Language Model + - Learn how to integrate KleidiAI SME2 matmul microkernels into an AI framework or application + +prerequisites: + - Knowledge of KleidiAI and SME2 + +author: Zenon Zhilong Xiu + +### Tags
skilllevels: Advanced
subjects: ML
armips:
 - Arm C1 CPU
 - Arm SME2 unit
tools_software_languages:
 - C++
 - 
KleidiAI + - llama.cpp +operatingsystems: + - Android + - Linux + + + +further_reading: + - resource: + title: part 1 Arm Scalable Matrix Extension Introduction + link: https://developer.arm.com/community/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction + type: blog + - resource: + title: part 2 Arm Scalable Matrix Extension Instructions + link: https://developer.arm.com/community/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction-p2 + type: blog + - resource: + title: part4 Arm SME2 Introduction + link: https://developer.arm.com/community/arm-community-blogs/b/architectures-and-processors-blog/posts/part4-arm-sme2-introduction + type: blog + - resource: + title: Profile llama.cpp performance with Arm Streamline and KleidiAI LLM kernels + link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/ + type: blog + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. 
+--- diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/_next-steps.md b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/_next-steps.md new file mode 100644 index 0000000000..727b395ddd --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # The weight controls the order of the pages. _index.md always has weight 1. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/explain_with_an_example_p1.md b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/explain_with_an_example_p1.md new file mode 100644 index 0000000000..414c677fad --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/explain_with_an_example_p1.md @@ -0,0 +1,55 @@ +--- +title: Explain the SME2 matmul microkernel with an example - Part 1 +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Explain the SME2 matmul microkernel with an example - Part 1 +By integrating the SME2‑optimized KleidiAI kernels into llama.cpp, the heavy matrix‑multiplication workloads in the K, Q, and V computations of the attention blocks, as well as in the FFN layers, can be delegated to the SME2 matmul microkernel when running the Llama-3.2-3B-Q4_0.gguf model. +In these operators, the LHS (activation) data type is FP32, while RHS (weight) type uses GGML Q4_0 quantized type. 
+ +To make the demonstration easier in this learning path, the LHS dimension [m, k] is simplified to [16, 64], the RHS dimension [n, k] is simplified to [64, 64], and the SME2 SVL is set as 512-bit. + +### Packing the RHS +Although the original Q4_0 RHS (weight) in the model uses INT4 quantization, it is signed INT4 quantization, rather than the unsigned INT4 quantization that the SME2 matmul microkernel requires. Moreover, the layout of the INT4 quantized data and the quantization scale does not meet the requirements of the SME2 matmul microkernel either. Therefore, the RHS from the model needs to be converted from the signed INT4 data to unsigned INT4 and repacked. +Since the RHS (weight) remains unchanged during inference, this conversion and packing need to be performed only once when loading the model. + + +Let us have a close look at GGML Q4_0 quantization first to know how the original FP32 weight is quantized to Q4_0 format. +In the Q4_0 model, the Q4_0 weights are stored in a layout of [n, k]. +GGML Q4_0 quantizes weights in blocks of 32 floats. For each block, it calculates a scale for the block and then converts each value into a signed 4-bit integer. The scale is stored as FP16. +Then GGML Q4_0 packs the values as follows: +- the low nibble (bits 0–3) holds the first value (even index) +- and the high nibble (bits 4–7) holds the second value (odd index) +Thus, each byte contains a low/high pair. +The following diagram shows how GGML Q4_0 quantizes and packs the original [n, k] FP32 matrix into Q4_0 type with layout of [n, k]. +![Figure showing GGML Q4_0 quantization alt-text#center](images/q4_0_format.jpg "GGML Q4_0 quantization") + +Unfortunately, the Q4_0 format does not meet the requirements of the SME2 matmul microkernel. It needs to be converted to an unsigned INT4 quantization format and repacked using the *kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon* function. + +In this example, we use m=16 and k=64. 
+- The required mr value for the SME2 matmul kernel is obtained using *kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa*. Here, mr=16. +- The required nr value for the SME2 matmul kernel is obtained using *kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa*. Here, nr=64. +- The required kr value for the SME2 matmul kernel is obtained using *kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa*. Here, kr=4. +- The required sr value for the SME2 matmul kernel is obtained using *kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa*. Here, sr=2 (two INT4 elements in a byte). + +The function call stack for this process in llama.cpp when loading the model is as follows: +```text +llama_model_load + llama_model::load_tensors + llama_model_loader::load_all_data + ggml_backend_tensor_set + ggml_backend_cpu_kleidiai_buffer_set_tensor + ggml::cpu::kleidiai::tensor_traits::repack + kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon +``` +This process can be illustrated with the diagram below. +![Figure showing RHS packing with KleidiAI alt-text#center](images/kai_kernel_packed_rhs.jpg "RHS packing with KleidiAI") + +The numerical label of an element in the diagram is used to indicate its row and column number in the original matrix. For example , +![Figure showing Row_Col lable alt-text#center](images/row_col_lable.png "Row_Col lable") +it indicates that the element locates at row 01, column 02 in the original matrix. This row and column number remains unchanged in its quantized and packed matrix, so that the location of the element can be tracked easily. + +Now, the RHS is converted and packed into a format that can be handled by the SME2 matmul microkernel, allowing the packed RHS to be loaded into SME2 Z registers with sequential memory access. This improves memory access efficiency and reduces cache misses. 
\ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/explain_with_an_example_p2.md b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/explain_with_an_example_p2.md new file mode 100644 index 0000000000..4e4851dc64 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/explain_with_an_example_p2.md @@ -0,0 +1,34 @@ +--- +title: Explain the SME2 matmul microkernel with an example - Part 2 +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Explain the SME2 matmul microkernel with an example - Part 2 +Next, the FP32 LHS (activation) needs to be quantized and packed when the llama.cpp graph runner computes the matmul nodes/operators. + +### Quantization and Packing of the LHS +Since the LHS (activation) keep changing, we need to dynamically quantize the original FP32 matrix and pack it into the qsi8d32p1vlx4 format. This can be achieved using the *kai_run_lhs_quant_pack_qsi8d32p_f32_neon* microkernel. 
+ +The function call stack for this process in llama.cpp is as follows: +```text +llama_context::decode + llama_context::process_ubatch + llama_context::graph_compute + ggml_backend_sched_compute_splits + ggml_backend_cpu_graph_compute + ggml_graph_compute //tick off the compute thread + ggml_graph_compute_thread //the compute thread + ggml_compute_forward + ggml_cpu_extra_compute_forward + ggml::cpu::kleidiai::tensor_traits::compute_forward + ggml::cpu::kleidiai::tensor_traits::compute_forward_q4_0 + kai_run_lhs_quant_pack_qsi8d32p_f32_neon +``` +The diagram below illustrates how the LHS is quantized and packed by *kai_run_lhs_quant_pack_qsi8d32p_f32_neon*, +![Figure showing Quantization and Packing of the LHS alt-text#center](images/kai_run_lhs_quant_pack_qsi8d32p_f32_neon_for_sme2.jpg "Quantization and Packing of the LHS") + +The values of mr, nr, and kr can be obtained in the same way as described above. +The mr, nr, and kr together with the matrix dimensions m and k are passed as parameters to *kai_run_lhs_quant_pack_qsi8d32p_f32_neon*. This function quantizes the FP32 LHS to signed INT8 type and packs the quantized data and quantization scales as shown in the diagram above. It divides the m x k matrix into submatrices of size mr x kr (here 16 x 4) as shown in blocks outlined by dashed lines in the upper matrix of the diagram, and then sequentially packs the rows within each submatrix. This allows the SME2 matmul kernel to load an entire submatrix into an SME2 Z register from contiguous memory, thus reducing cache misses by avoiding loading the submatrix across multiple rows. 
diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/explain_with_an_example_p3.md b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/explain_with_an_example_p3.md new file mode 100644 index 0000000000..caa702f4bd --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/explain_with_an_example_p3.md @@ -0,0 +1,78 @@ +--- +title: Explain the SME2 matmul microkernel with an example - Part 3 +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Explain the SME2 matmul microkernel with an example - Part 3 +Once the required LHS and RHS are both ready, the *kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa* microkernel can now run. + +### Run the SME2 matmul microkernel +The operations performed to compute a 16x64 result submatrix (four 16x16 submatrices) (1VL x 4VL) are as follows: + +- Iterate over the blocks along the K dimension + - Iterate in a block with a step of kr (kr=4) + - Load one SME2 SVL-length (512-bit) of data from the quantized and packed LHS (containing 64 INT8 values) into one SME2 Z register + - Load two SME2 SVL-lengths of data from the packed RHS (containing 2 x64x2 INT4 values) into two SME2 Z registers, then use the SME2 LUTI4 lookup table instruction to convert these INT4 values into INT8 type, extending them to four SME2 Z registers (4VL). + - Use the SME2 INT8 Outer Product Accumulate (MOPA) instruction to perform outer product operations with the LHS Z register as one source and each of the four RHS Z registers as the other, accumulating the results in four ZA tiles (which are initialized to zero). It produces intermediate results of four 16x16 output submatrices. 
+ The process of the first iteration can be illustrated in the diagram below: +![Figure showing the first iteration of the inner loop alt-text#center](images/run_matmul_sme2_step1.jpg "The first iteration of the inner loop") + The diagram below illustrates the process of the second iteration along the K dimension, +![Figure showing the second iteration of the inner loop alt-text#center](images/run_matmul_sme2_step2.jpg "The second iteration of the inner loop") + - After completing the iterations in the block, the intermediate INT32 results of four 16x16 output submatrices are dequantized with the per-block LHS and RHS scale to FP32 floats, using Floating-point Multiply (FMUL), Floating-point Multiply and Accumulate (FMLA) and Signed fixed-point Convert to Floating-point (SCVTF) vector instructions. It produces the intermediate FP32 results of four 16x16 output submatrices. + - Accumulate the FP32 result above + +After completing the iteration along the K dimension, the FP32 results of four 16x16 output submatrices are ready. Then, save the result into memory. + +The code can be found [here](https://github.com/ARM-software/kleidiai/blob/main/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa_asm.S#L80) +Some comments are added to the code to help you understand it. 
+```asm +KAI_ASM_LABEL(label_3) // K Loop + KAI_ASM_INST(0xc00800ff) // zero {za} , zeros the four ZA tile (za0.s, za1.s, za2.s, za3.s) + mov x11, x4 //Set block size +KAI_ASM_LABEL(label_4) // Block Loop + KAI_ASM_INST(0xa0404342) //ld1w {z2.s - z3.s}, pn8/z, [x26] // load two VLs packed RHS data (64x2x2 INT4 data) + addvl x26, x26, #2 // increase RHS address by two VLs + ld1h {z8.h}, p0/z, [x3] //load one VL quantized and packed LHS data (64 INT8 data) + addvl x3, x3, #1 // increase LHS address by one VLs + KAI_ASM_INST(0xc08a4044) // luti4 {z4.b - z5.b}, zt0, z2[0] //use LUT4I instruction to convert INT4 to INT8, one source VL produces two VLs result + KAI_ASM_INST(0xc08a4066) // luti4 {z6.b - z7.b}, zt0, z3[0] //use LUT4I instruction to convert INT4 to INT8, one source VL produces two VLs result + KAI_ASM_INST(0xa0840100) // smopa za0.s, p0/m, p0/m, z8.b, z4.b ] //Outer Product Accumulate with the VL of LHS, the first VL of RHS and ZA0.S + KAI_ASM_INST(0xa0850101) // smopa za1.s, p0/m, p0/m, z8.b, z5.b //Outer Product Accumulate with the VL of LHS, the second VL of RHS and ZA1.S + KAI_ASM_INST(0xa0860102) // smopa za2.s, p0/m, p0/m, z8.b, z6.b //Outer Product Accumulate with the VL of LHS, the third VL of RHS and ZA2.S + KAI_ASM_INST(0xa0870103) // smopa za3.s, p0/m, p0/m, z8.b, z7.b b //Outer Product Accumulate with the VL of LHS, the forth VL of RHS and ZA3.S + + subs x11, x11, #4 //block_index - 4 + b.gt label_4 //end of block iteration? 
+ + // the code below performs per block dequantization of the four tiles with LHS and RHS scales + mov w12, #0 + mov x25, x24 + ld1b {z17.b}, p4/z, [x3] // lhs sum + ld1b {z16.b}, p4/z, [x3, #1, mul vl] // lhs scale + addvl x3, x3, #2 + KAI_ASM_INST(0xa040c354) // ld1w { z20.s - z23.s }, pn8/z, [x26] // rhs zp + KAI_ASM_INST(0xa041c340) // ld1w { z0.s - z3.s }, pn8/z, [x26, #4, mul vl ] // rhs scale + addvl x26, x26, #8 + pfalse p3.b +KAI_ASM_LABEL(label_5) + // omit some codes that perform the block quantization and save the result to memory + …… + blt label_5 + subs x10, x10, x4 //decrease the K index + b.gt label_3 //end of K loop? + +``` +In a single block loop, four pipelined SME2 INT8 MOPA instructions perform 4,096 MAC operations, calculating the intermediate results for the four 16x16 submatrices. It proves that SME2 MOPA can significantly improve matrix multiplication performance. + +To help understand the whole process, we map the first itration of LHS and RHS quantization and packing steps, as well as SME2 outer product accumulate operation and dequantization, back to the original FP32 LHS and RHS operations. Essentially, they equally perform the operation as shown below (there might be some quantization loss), +![Figure showing the original matrix representing of the first itration alt-text#center](images/run_matmul_sme2_original_present_step1.jpg "the original matrix representing of the first itration") + +The second iteration can be mapped back to the original FP32 LHS and RHS operations as below, +![Figure showing the original matrix representing of the second itration alt-text#center](images/run_matmul_sme2_original_present_step2.jpg "the original matrix representing of the second itration") + +**Note**: In this diagram, the RHS is laid out in the dimension of [N, K], which is different from the [K, N] dimension layout of the RHS in the video demonstration of 1VLx4VL. 
If you interpret the RHS in the diagrams above using the [K, N] dimension, you can match the previous video demonstration with the diagrams above. + +By repeating the submatrix computation across the M and N dimensions, the entire result matrix can be calculated. If a non-empty bias is passed to the SME2 matmul microkernel, it also adds the bias to the result matrix. diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/kai_kernel_packed_rhs.jpg b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/kai_kernel_packed_rhs.jpg new file mode 100644 index 0000000000..590e7595a0 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/kai_kernel_packed_rhs.jpg differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/kai_matmul_kernel.jpg b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/kai_matmul_kernel.jpg new file mode 100644 index 0000000000..37800a20cc Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/kai_matmul_kernel.jpg differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/kai_run_lhs_quant_pack_qsi8d32p_f32_neon_for_sme2.jpg b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/kai_run_lhs_quant_pack_qsi8d32p_f32_neon_for_sme2.jpg new file mode 100644 index 0000000000..3d9de4e05e Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/kai_run_lhs_quant_pack_qsi8d32p_f32_neon_for_sme2.jpg differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/q4_0_format.jpg 
b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/q4_0_format.jpg new file mode 100644 index 0000000000..8a8e29af17 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/q4_0_format.jpg differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/row_col_lable.png b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/row_col_lable.png new file mode 100644 index 0000000000..14497adcf7 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/row_col_lable.png differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/run_matmul_sme2_original_present_step1.jpg b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/run_matmul_sme2_original_present_step1.jpg new file mode 100644 index 0000000000..e4df39ef0c Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/run_matmul_sme2_original_present_step1.jpg differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/run_matmul_sme2_original_present_step2.jpg b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/run_matmul_sme2_original_present_step2.jpg new file mode 100644 index 0000000000..ecc96e3b58 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/run_matmul_sme2_original_present_step2.jpg differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/run_matmul_sme2_step1.jpg b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/run_matmul_sme2_step1.jpg new file mode 
100644 index 0000000000..2cfddfb42f Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/run_matmul_sme2_step1.jpg differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/run_matmul_sme2_step2.jpg b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/run_matmul_sme2_step2.jpg new file mode 100644 index 0000000000..6f18b50b39 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/run_matmul_sme2_step2.jpg differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/sme2_mopa.jpg b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/sme2_mopa.jpg new file mode 100644 index 0000000000..84c790710e Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/images/sme2_mopa.jpg differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/kai_matmul_kernel_overview.md b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/kai_matmul_kernel_overview.md new file mode 100644 index 0000000000..f7d9ebdafe --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/kai_matmul_kernel_overview.md @@ -0,0 +1,57 @@ +--- +title: How does a KleidiAI matmual microkernel perform matrix multiplication with quantized data? +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## How does a KleidiAI matmual microkernel perform matrix multiplication with quantized data? +Essentially, a KleidiAI matmul microkernel uses tile-based matrix multiplication(matmul) where small submatrices of the output are computed one by one. 
+- **mr**: number of rows of Matrix C (and Matrix A) computed at once +- **nr**: number of columns of Matrix C (and Matrix B) computed at once +- **bl**: number of elements from the K dimension processed per block at once +- **kr**: number of elements from the K dimension processed per inner step + +The video below demonstrates how matrix multiplication is carried out using this method. +![Figure showing Tile-Based matrix multiplication with KleidiAI alt-text#center](videos/matrix_tile.gif "Tile-Based matrix multiplication with KleidiAI") + +This process can be denoted with the following pseudocode, +```c +// RHS N LOOP +for(n_idx = 0; n_idx < n; n_idx+=nr){ + // LHS M LOOP + for(m_idx = 0; m_idx < m; m_idx+=mr){ + // K LOOP, break K into blocks first + blocks_in_K= K/bl; // bl is the block length + //Block Loop + for(bl_idx = 0; bl_idx< blocks_in_K; bl_idx += 1) { + //Loop inside a block + krs_in_block= bl/kr; //kr is the number of elements in K dimension per inner loop + for(k_idx = 0; k_idx < krs_in_block; k_idx +=1) { + // Perform the matrix multiplication with source submatrices of size [mr, kr] and [kr, nr] + // Accumulate the matrix multiplication result above into per block level result. + … + } + // Accumulate per block level results along K dimension. When iteration on K dimension is completed,a submatrix of size [mr, nr] of the output matrix is ready + } + //Continue computing a submatrix of size [mr, nr] of the output matrix along M dimension + } + //Continue computing a submatrix of size [mr, nr] of the output matrix along N dimension +} +``` +In general, KleidiAI matmul microkernels implement matrix mulitplication in a similar way as the pseudocode. + +KleidiAI also provides corresponding packing microkernels for the matmul microkernels, in order to make efficient contiguous memory access to the input of the matrix multiplication, reducing cache misses. + +KleidiAI supports quantized matrix multiplication to speed up AI inference on Arm CPUs. 
Instead of multiplying full precision (FP32) matrices A and B directly, it quantizes: +- The Left Hand Source (LHS, or Left Hand Matrix/activation) matrix to 8-bit integers +- The Right Hand Source (RHS, or Right Hand Matrix/weights) matrix to 4-bit or 8-bit integers + +then packs those quantized values into memory layouts suitable for CPU vector instructions such as Dotprod, I8MM, and SME2 instructions. +It then runs a microkernel that efficiently computes on packed quantized data, then scales back to floating point. + +This process can be illustrated in the following diagram, +![Figure showing quantized matrix multiplication with KleidiAI kernels alt-text#center](images/kai_matmul_kernel.jpg "Quantized matrix multiplication with KleidiAI kernel") + +Please find more information in this learning path, [Accelerate Generative AI workloads using KleidiAI](https://learn.arm.com/learning-paths/cross-platform/kleidiai-explainer/). \ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/sme2_mpoa_matmul.md b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/sme2_mpoa_matmul.md new file mode 100644 index 0000000000..6b8ec9a77c --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/sme2_mpoa_matmul.md @@ -0,0 +1,43 @@ +--- +title: How are SME2 INT8 Outer Product Accumulate instructions used in a matrix multiplication? +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## How are SME2 INT8 Outer Product Accumulate instructions used in a matrix multiplication? +The INT8 Outer Product Accumulate instructions calculate the sum of four INT8 outer products, widening results into INT32, then the result is destructively added to the destination tile. 
+![Figure showing SME2 INT8 MOPA instruction alt-text#center](images/sme2_mopa.jpg "SME2 INT8 MOPA instruction") + +When SME2 SVL is 512-bit, each input register (Zn.B, Zm.B) is treated as a matrix of 16x4 INT8 elements, as if each block of four contiguous elements were transposed. +- The first source, Zn.B, contains a 16x4 sub-matrix of 8-bit integer values. +- The second source, Zm.B, contains a 16x4 sub-matrix of 8-bit integer values. +- The INT8 MOPA instruction calculates a 16x16 widened 32-bit integer sum of outer products, which is then destructively added to the 32-bit integer destination tile, ZAda. + +The video below shows how SME2 INT8 Outer Product Accumulate instructions are used for matrix multiplication. +![Figure showing Matrix Multiplication with 1VLx1VL SME2 MOPA alt-text#center](videos/matrix_mopa_sme2_1vl.gif "Matrix Multiplication with 1VLx1VL SME2 MOPA") +To calculate the result of a 16x16 sub-matrix in matrix C (element type: INT32): + +First, +- a 16x4 sub-matrix in matrix A (element type: INT8) is loaded into an SME2 Z register, +- a 4x16 sub-matrix in matrix B (element type: INT8) is loaded into another SME2 Z register +- a 16x16 sub-matrix in matrix C is stored in an SME2 ZA tile, which is initialized to zero only once + +Then, the SME2 INT8 MOPA instruction uses the data from these two Z registers to perform the outer product operation and accumulates the results into the ZA tile, which holds the 16x16 sub-matrix of matrix C, thus obtaining an intermediate result for this 16x16 sub-matrix. + +Iterate over the K dimension, repeatedly loading 16x4 submatrices from matrix A and 4×16 submatrices from matrix B. For each step, use the SME2 INT8 MOPA instruction to compute outer products and accumulate the results into the same ZA tile. After completing the iteration over K, this ZA tile holds the final values for the corresponding 16×16 submatrix of matrix C. Finally, store the contents of the ZA tile back to memory.
+ +Apply the same process to all 16x16 sub-matrices in matrix C to complete the entire matrix computation. + +To improve performance, we can pipeline four MOPA instructions and fully utilize four ZA tiles in ZA storage; each MOPA instruction uses one ZA tile. +The video below demonstrates how the four MOPA instructions are used to perform matrix multiplication of one 16x4 submatrix from matrix A and four 4x16 submatrices from matrix B in a single iteration. This approach can be referred to as 1VLx4VL. + +![Figure showing Matrix Multiplication with 1VLx4VL SME2 MOPA alt-text#center](videos/1vlx4vl_sme2_mopa.gif "Matrix Multiplication with 1VLx4VL SME2 MOPA") +The intermediate result of the 4x16x16 output submatrix is held in four ZA.S tiles. + +You can find more information about SME2 MOPA here: +- [part 1 Arm Scalable Matrix Extension Introduction](https://developer.arm.com/community/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction) +- [part 2 Arm Scalable Matrix Extension Instructions](https://developer.arm.com/community/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction-p2) +- [part 4 Arm SME2 Introduction](https://developer.arm.com/community/arm-community-blogs/b/architectures-and-processors-blog/posts/part4-arm-sme2-introduction) + \ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/summary.md b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/summary.md new file mode 100644 index 0000000000..469cd9030a --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/summary.md @@ -0,0 +1,10 @@ +--- +title: Summary +weight: 8 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Summary +This learning path vividly explains how an SME2-optimized KleidiAI microkernel performs quantization and packing of
the RHS and LHS, and how it leverages the powerful SME2 MOPA instructions to enhance matrix multiplication performance. We hope this learning path helps developers learn how to integrate the KleidiAI microkernel into their ML/AI frameworks or applications, or to design their own SME2-optimized kernels, thus fully utilizing the potential of SME2. \ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/the_sme2_matmul_microkernel.md b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/the_sme2_matmul_microkernel.md new file mode 100644 index 0000000000..6843b44fa2 --- /dev/null +++ b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/the_sme2_matmul_microkernel.md @@ -0,0 +1,31 @@ +--- +title: What is the SME2 1vlx4vl microkernel? +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## What is the SME2 1vlx4vl microkernel? +We use a KleidiAI microkernel, *kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa*, to explain KleidiAI SME2 microkernels in detail. It is referred to as ‘the SME2 matmul microkernel’ in this learning path onwards, unless otherwise noted. + +“_1vlx4vl” in the name indicates that, in a single inner loop iteration, it computes an intermediate result for a 1VL x 4VL submatrix (one SME2 Streaming Vector Length x four SME2 Streaming Vector Lengths) of the output matrix. Assuming the SME2 SVL is 512 bits, it is a 16 x 64 submatrix, since each FP32 element is 32 bits: 512/32 = 16 rows and 4 x 512/32 = 64 columns. + +To improve performance, we can pipeline four MOPA instructions and fully utilize four ZA tiles in ZA storage; each MOPA instruction uses one ZA tile. +The video below demonstrates how the four MOPA instructions are used to perform matrix multiplication of one 16x4 submatrix (1VL) from matrix A and four 4x16 submatrices from matrix B (4VL) in a single iteration.
+![Figure showing Matrix Multiplication with 1VLx4VL SME2 MOPA alt-text#center](videos/1vlx4vl_sme2_mopa.gif "Matrix Multiplication with 1VLx4VL SME2 MOPA") +The intermediate result of the 4x16x16 output submatrix is held in four ZA.S tiles. + +“qsi8d32p1vlx4” in the name indicates that it expects the LHS with a layout of [M, K] to be symmetrically quantized into signed INT8 type within blocks of 32 elements. +The entire quantized LHS is then divided into submatrices of size 1VL × 4 (since the SME2 SVL is set as 512 bits, it is 16 × 4). Then, each submatrix is packed row-wise into a contiguous memory layout, and all the submatrices are packed in this way one after another, so that when using the packed LHS in the SME2 matmul microkernel, memory accesses are to contiguous addresses, improving cache locality. + +“qsi4c32p4vlx4” in its name indicates that the SME2 matmul microkernel expects the RHS with a layout of [N, K] to be symmetrically quantized into signed INT4 type within blocks of 32 elements. +The entire quantized RHS is then divided into submatrices of size 4VL × 4 (since the SME2 SVL is set as 512 bits, it is (4 × 16) × 4). Each submatrix is packed row-wise into a contiguous memory layout. Since the quantization type is INT4, each byte contains two INT4 elements. In the SME2 matmul microkernel, the SME2 LUTI instructions efficiently dequantize INT4 elements into INT8 type, thereby enabling fast matrix multiplication with SME2 INT8 MOPA instructions. + +“_f32_” in its name indicates that the SME2 matmul microkernel outputs an FP32 result matrix. The INT32 result produced by SME2 INT8 MOPA instructions has to be dequantized back to FP32 type. + +Sometimes, the original LHS or RHS may not conform to the quantization and packing format requirements of the SME2 matmul microkernel. The software needs to quantize and pack the LHS and RHS appropriately first.
+ +Next, we will take llama.cpp and the Llama-3.2-3B-Q4_0.gguf model for example to demonstrate, +- how to quantize and pack the LHS and RHS +- perform matrix multiplication using the SME2 matmul microkernel \ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/videos/1vlx4vl_sme2_mopa.gif b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/videos/1vlx4vl_sme2_mopa.gif new file mode 100644 index 0000000000..012291cad0 Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/videos/1vlx4vl_sme2_mopa.gif differ diff --git a/bin/pagefind.arm64 b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/videos/matrix_mopa_sme2_1vl.gif old mode 100755 new mode 100644 similarity index 50% rename from bin/pagefind.arm64 rename to content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/videos/matrix_mopa_sme2_1vl.gif index ff32e1c5d2..e3b471a398 Binary files a/bin/pagefind.arm64 and b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/videos/matrix_mopa_sme2_1vl.gif differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/videos/matrix_tile.gif b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/videos/matrix_tile.gif new file mode 100644 index 0000000000..e91549875c Binary files /dev/null and b/content/learning-paths/mobile-graphics-and-gaming/kai_sme2_matmul_ukernel_explained/videos/matrix_tile.gif differ diff --git a/content/learning-paths/mobile-graphics-and-gaming/onnx/02_setup.md b/content/learning-paths/mobile-graphics-and-gaming/onnx/02_setup.md index 718ce5c982..ff08ffd5e2 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/onnx/02_setup.md +++ b/content/learning-paths/mobile-graphics-and-gaming/onnx/02_setup.md @@ -19,28 +19,55 
@@ You can choose a variety of hardware, including: The nice thing about ONNX is that the **same model file** can run across all of these, so your setup is flexible. ## Install Python -Depending on the hardware you use you follow different installation paths -1. Linux (Arm64). In the console type: +{{% notice Note %}} +ONNX Runtime provides prebuilt wheels only for specific Python versions. At the time of writing, Python 3.12 is not yet supported by ONNX Runtime on macOS or Arm platforms. If you see an error like: + +```output +ERROR: No matching distribution found for onnxruntime +``` + +it usually means your Python version is too new. Python 3.10 is tested and recommended for this Learning Path. +{{% /notice %}} + +Depending on the hardware you use, follow different installation paths: + +1. Linux (Arm64). Install Python 3.10 by typing in the console: ```console sudo apt update -sudo apt install -y python3 python3-venv python3-pip build-essential libopenblas-dev libgl1 libglib2.0-0 +sudo apt install -y python3.10 python3.10-venv python3.10-dev python3-pip build-essential libopenblas-dev libgl1 libglib2.0-0 ``` -2. macOS (Apple Sillicon): +If Python 3.10 is not available in your default repositories, you can use the deadsnakes PPA: ```console -brew install python +sudo add-apt-repository ppa:deadsnakes/ppa +sudo apt update +sudo apt install -y python3.10 python3.10-venv python3.10-dev ``` +2. macOS (Apple Silicon). Install Python 3.10 using Homebrew: +```console +brew install python@3.10 +``` + +After installation, use `python3.10` explicitly when creating virtual environments. + 3. Windows on Arm: -* Install Python 3.10+ from python.org (Arm64 build). +* Download and install Python 3.10 from [python.org](https://www.python.org/downloads/) (select the Arm64 build). * Ensure pip is on PATH. 
-After installing Python, open a terminal or console, create a clean virtual environment, and update pip and wheel: +After installing Python 3.10, open a terminal or console, create a clean virtual environment using Python 3.10 explicitly, and update pip and wheel: + +```console +python3.10 -m venv .venv +source .venv/bin/activate +python -m pip install --upgrade pip wheel +``` +On macOS, if `python3.10` is not found, use the full Homebrew path: ```console -python3 -m venv .venv -source .venv/bin/activate # on Windows use: .venv\Scripts\activate +/opt/homebrew/bin/python3.10 -m venv .venv +source .venv/bin/activate python -m pip install --upgrade pip wheel ``` diff --git a/content/learning-paths/mobile-graphics-and-gaming/onnx/03_preparingdata.md b/content/learning-paths/mobile-graphics-and-gaming/onnx/03_preparingdata.md index b5bea9f3b2..db8a544395 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/onnx/03_preparingdata.md +++ b/content/learning-paths/mobile-graphics-and-gaming/onnx/03_preparingdata.md @@ -241,9 +241,18 @@ data/ pip install pandas pyarrow opencv-python tqdm numpy ``` -2. Place the Parquet file (e.g., train_1.parquet) next to the script or update PARQUET_PATH accordingly. Here we used the file from [this location](https://huggingface.co/datasets/Ritvik19/Sudoku-Dataset/blob/main/train_1.parquet). +2. Download the Sudoku dataset from Hugging Face. This Learning Path uses the `train_1.parquet` file from the [Ritvik19/Sudoku-Dataset](https://huggingface.co/datasets/Ritvik19/Sudoku-Dataset) repository. -3. Run the generator +Download the dataset file: +```console +wget https://huggingface.co/datasets/Ritvik19/Sudoku-Dataset/resolve/main/train_1.parquet +``` + +Alternatively, you can download it manually from the [direct link](https://huggingface.co/datasets/Ritvik19/Sudoku-Dataset/blob/main/train_1.parquet) and save it as `train_1.parquet` in your working directory. 
+ +The Parquet file should be placed next to the script, or you can update `PARQUET_PATH` in the code to point to its location. + +3. Run the generator: ```console python3 02_PrepareData.py ``` diff --git a/content/learning-paths/mobile-graphics-and-gaming/onnx/04_training.md b/content/learning-paths/mobile-graphics-and-gaming/onnx/04_training.md index 332db92273..671b8d04ad 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/onnx/04_training.md +++ b/content/learning-paths/mobile-graphics-and-gaming/onnx/04_training.md @@ -61,7 +61,6 @@ import torch.nn.functional as F from torch.utils.data import DataLoader from torchvision import datasets, transforms from tqdm import tqdm -from torch.onnx import dynamo_export from torch.export import Dim import onnxruntime as ort @@ -189,6 +188,15 @@ The file will create two artifacts: Right after export, the script runs a parity test: it feeds the same randomly generated batch through both the PyTorch model and the ONNX model (executed by ONNX Runtime) and prints the mean absolute error between their logits. A tiny value confirms the exported graph faithfully matches your trained network. ## Running the script + +{{% notice Note %}} +The Dynamo-based ONNX exporter requires PyTorch 2.1 or later. 
If you encounter errors related to `torch.export.Dim` or the `dynamo` parameter, ensure you have an up-to-date PyTorch installation: + +```console +pip install --upgrade torch torchvision +``` +{{% /notice %}} + To run the training script, type: ```console diff --git a/content/learning-paths/mobile-graphics-and-gaming/onnx/05_inference.md b/content/learning-paths/mobile-graphics-and-gaming/onnx/05_inference.md index 3a8fdc1426..32e373289b 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/onnx/05_inference.md +++ b/content/learning-paths/mobile-graphics-and-gaming/onnx/05_inference.md @@ -213,6 +213,13 @@ These artifacts provide both quantitative and qualitative insight into model per In the sample grid, each tile shows one crop together with its True label (T:) and Predicted label (P:), with correct predictions highlighted in green and mistakes highlighted in red. This makes it easy to quickly verify that the classifier behaves sensibly and to spot remaining failure modes. ## Running the script + +Before running the evaluation script, install matplotlib for visualization: + +```console +pip install matplotlib +``` + Run the evaluation script from the project root: ```console diff --git a/content/learning-paths/mobile-graphics-and-gaming/onnx/_index.md b/content/learning-paths/mobile-graphics-and-gaming/onnx/_index.md index b1d1e756f2..ba5b1ed031 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/onnx/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/onnx/_index.md @@ -17,7 +17,7 @@ learning_objectives: - Deploy an optimized ONNX model inside an Android app. prerequisites: - - A development machine with Python 3.10+ installed. + - A development machine with Python 3.10 installed (Python 3.11 also works; Python 3.12 is not yet supported by ONNX Runtime on Arm platforms). - Basic familiarity with PyTorch or TensorFlow. - An Arm64 device (e.g., Raspberry Pi or Android smartphone). 
- "[Android Studio](https://developer.android.com/studio) installed for deployment testing." diff --git a/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/_index.md b/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/_index.md index 522e88ea8a..d5002eb90a 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/_index.md @@ -1,24 +1,20 @@ --- -title: Profile SME2 with llama.cpp and KleidiAI on Android - -draft: true -cascade: - draft: true +title: Measure LLM inference performance with KleidiAI and SME2 on Android minutes_to_complete: 40 who_is_this_for: This is an advanced topic for software developers, performance engineers, and AI practitioners -learning_objectives: - - Build llama.cpp library with KleidiAI and SME2 support - - Profile performance of LLMs running on llama-cli - - Learn how KleidiAI and SME2 accelerates LLM operators +learning_objectives: + - Build llama.cpp with KleidiAI and SME2 support + - Profile LLM inference performance on Android + - Understand how KleidiAI and SME2 accelerate LLM operators prerequisites: - Knowledge of KleidiAI and SME2 - - A Linux host machine (x86_64 or aarch64) for building llama.cpp with the Arm GNU Toolchain used in this Learning Path - - Git, CMake and Android Debug Bridge (ADB) installed on the host machine - - An Android device with Arm SME2 support for running and profiling the built executable + - A Linux host machine (x86_64 or aarch64) for building llama.cpp with the Arm GNU Toolchain + - Git, CMake, and Android Debug Bridge (ADB) installed on your host machine + - An Android device with Arm SME2 support for running and profiling the executable author: Zenon Zhilong Xiu @@ -39,21 +35,21 @@ operatingsystems: further_reading: - resource: - title: part 1 Arm Scalable Matrix Extension Introduction + title: Arm Scalable Matrix Extension 
introduction, part 1 link: https://developer.arm.com/community/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction type: blog - resource: - title: part 2 Arm Scalable Matrix Extension Instructions + title: Arm Scalable Matrix Extension instructions, part 2 link: https://developer.arm.com/community/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-scalable-matrix-extension-introduction-p2 type: blog - resource: - title: part4 Arm SME2 Introduction + title: Arm SME2 introduction, part 4 link: https://developer.arm.com/community/arm-community-blogs/b/architectures-and-processors-blog/posts/part4-arm-sme2-introduction type: blog - resource: title: Profile llama.cpp performance with Arm Streamline and KleidiAI LLM kernels link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/ - type: blog + type: website diff --git a/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/build_llama_cpp.md b/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/build_llama_cpp.md index 3762837042..0de6893c3e 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/build_llama_cpp.md +++ b/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/build_llama_cpp.md @@ -6,9 +6,11 @@ weight: 4 layout: learningpathall --- +## Overview + In this section, you set up a GCC cross-compile toolchain and build a statically linked `llama-cli` binary with KleidiAI and SME2 enabled. -For convenience, llama.cpp is statically linked. You use the aarch64 GCC cross compile toolchain, *aarch64-none-linux-gnu-*, to build the project. To support SME2, GCC compiler version 14.2 and onwards is required. +For convenience, llama.cpp is statically linked. You use the aarch64 GCC cross compile toolchain, `aarch64-none-linux-gnu-`, to build the project. 
To support SME2, GCC compiler version 14.2 or later is required. The build uses the Linux-hosted Arm GNU Toolchain. If you are working on macOS or Windows, run these commands in a Linux environment (for example, a Linux VM, container, or a Linux development machine). @@ -36,7 +38,7 @@ tar -xf "${TOOLCHAIN_TAR}" export PATH="$PWD/${TOOLCHAIN_TAR%.tar.xz}/bin:$PATH" ``` -Verify the installation was successful by printing the toolchain version: +Confirm the installation succeeded by printing the compiler version: ```bash aarch64-none-linux-gnu-gcc --version @@ -44,25 +46,25 @@ aarch64-none-linux-gnu-gcc --version ## Clone the llama.cpp repository -The llama.cpp with tag b7610 is used in this tutorial. Newer versions should also work, but they are not tested. +This Learning Path uses llama.cpp tag b7610. Newer versions should also work but are not tested. -Next, download the llama.cpp source code and check out the tag used in this tutorial: +Download the llama.cpp source code and check out that tag: ```bash cd $HOME git clone --depth 1 --branch b7610 https://github.com/ggml-org/llama.cpp.git ``` -Create a new directory *build* under the llama.cpp root directory and change to the new directory: +Create a new `build` directory under the llama.cpp root directory and change to it: ```bash cd $HOME/llama.cpp mkdir build && cd build ``` -## Compile the binary +## Compile llama-cli for Android -Next, configure the project using the following command: +Configure the project. The key flags enable KleidiAI support (`-DGGML_CPU_KLEIDIAI=ON`) and SME2 instructions (`-march=...+sme2`), produce a statically linked binary (`-static`) that runs across Android and Linux environments, and include debug symbols (`-g`) for profiling: ```bash cmake .. 
\ @@ -91,16 +93,22 @@ Set `CMAKE_C_COMPILER` and `CMAKE_CXX_COMPILER` to your cross compiler path if i The `-static` and `-g` options are specified to produce a statically linked executable, in order to run on different Arm64 Linux/Android environments and include debug information. -Next, build the project, +Build the project: ```bash cd $HOME/llama.cpp/build cmake --build ./ --config Release -j $(nproc) ``` -After the building process completes, verify that *llama-cli* exists in the binary directory: +After the build completes, confirm that `llama-cli` exists in the binary directory: ```bash ls -la $HOME/llama.cpp/build/bin | grep llama-cli ``` -Now that you have a `llama-cli` build with KleidiAI enabled, move on to the next section to run the model on your SME2 device and observe the performance impact when you enable the microkernels. \ No newline at end of file +## What you've accomplished and what's next + +In this section: +- You installed the Arm GNU Toolchain (GCC 14.2) and configured it for aarch64 cross-compilation +- You built a statically linked `llama-cli` binary with KleidiAI and SME2 enabled, ready to run on your Android target + +In the next section, you'll transfer the binary and model to the device and compare inference performance with SME2 on and off. 
\ No newline at end of file diff --git a/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/introduction.md b/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/introduction.md index 31b669083e..8d4c229fd7 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/introduction.md +++ b/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/introduction.md @@ -1,20 +1,28 @@ --- -title: Overview +title: Understand how SME2 and KleidiAI accelerate LLM inference in llama.cpp weight: 2 ### FIXED, DO NOT MODIFY layout: learningpathall --- +## How SME2 and KleidiAI accelerate LLM inference in llama.cpp -## Introduction -In this section, you get a quick mental model of SME2, KleidiAI, and what llama.cpp is doing when it runs LLM inference on an Arm CPU. -Arm’s latest Client CPU processors such as Arm C1 include Scalable Matrix Extension 2 (SME2). SME2 accelerates the matrix-heavy AI operations behind large language models (LLMs), media processing, speech recognition, computer vision, real-time apps and multimodal apps. +In this Learning Path, you'll optimize llama.cpp inference on an Arm CPU by enabling SME2 acceleration through Arm KleidiAI microkernels. -llama.cpp provides extensive support for many LLMs, including Phi, Llama, DeepSeek, Gemma and Qwen. Llama.cpp is designed for efficient CPU-based inference. It enables on-device LLM execution, reducing latency and enhancing privacy. +You will measure the performance difference between the default CPU path and the SME2-optimized path using a 3 billion parameter LLM. By the end, you will understand: -By default llama.cpp integrates with Arm KleidiAI, a suite of optimized microkernels for Arm CPUs. KleidiAI includes SME2 optimized microkernels to get more performance benefits. 
+- What SME2 changes in the matrix execution path +- How KleidiAI integrates into llama.cpp’s CPU backend +- How to verify that SME2 microkernels are active +- What measurable improvement you should expect -In this learning path, llama.cpp and Llama-3.2-3B-Instruct-Q4_0.gguf model with 3 Billion parameters is used for the tutorial. +Scalable Matrix Extension 2 (SME2) is an Arm architectural feature designed to accelerate matrix-heavy workloads. Large language model (LLM) inference relies heavily on matrix multiplication, especially in transformer layers. When SME2 is available on the CPU, KleidiAI provides optimized microkernels that replace generic implementations inside llama.cpp. + +llama.cpp is a CPU-focused LLM inference engine. On Arm systems, it integrates with KleidiAI by default. If SME2 is supported and enabled at runtime, llama.cpp dispatches SME2-optimized matrix kernels for supported operations. + +This Learning Path uses the `Llama-3.2-3B-Instruct-Q4_0.gguf` model (3 billion parameters) as a reproducible test case. + +In the next section, you'll examine how KleidiAI microkernels integrate into the llama.cpp backend and where SME2 is activated in the execution path. 
diff --git a/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/kleidiai_integration.md b/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/kleidiai_integration.md index f857748b52..a506dcd95b 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/kleidiai_integration.md +++ b/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/kleidiai_integration.md @@ -1,19 +1,30 @@ --- -title: Integration of SME2 optimized KleidiAI microkernels in llama.cpp +title: Trace how KleidiAI and SME2 accelerate llama.cpp from model load to token decode weight: 3 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Integration of SME2 optimized KleidiAI microkernels in llama.cpp -In this section, you look at how SME2 connects into the Arm software ecosystem by following the path from llama.cpp, through its ggml CPU backend, down to Arm KleidiAI microkernels that take advantage of SME2 when the hardware supports it. +## Overview + +In this section, you trace how SME2 acceleration flows through llama.cpp across the full inference lifecycle — from model load to prefill and token decode. + +Rather than treating SME2 as a feature flag, you examine where acceleration is selected, how weights and activations are packed, and which microkernels execute at each stage of inference. You follow the path from high-level llama.cpp operations down into the ggml-cpu backend and finally into the KleidiAI microkernels that use SME2, I8MM, or DotProd instructions depending on hardware support. 
+ +By the end of this section, you will understand: +- Where KleidiAI integrates into the llama.cpp CPU backend +- Which operators are eligible for SME2 acceleration +- How microkernel selection priority works (SME2 → I8MM → DotProd) +- What changes between model load, prefill, and token decode + +This architectural view prepares you to build, profile, and validate SME2 acceleration in the next sections. ### Where does KleidiAI fit in the llama.cpp CPU backend? The KleidiAI library provides optimized matrix multiplication (matmul) kernels tailored for hardware features such as SME, I8MM, and Dot Product (DotProd) acceleration. In llama.cpp this feature is enabled with the build option `GGML_CPU_KLEIDIAI`. -![Figure showing components of llama.cpp alt-text#center](images/llama_components.jpg "Components of llama.cpp") +![Block diagram showing the llama.cpp architecture with the ggml-cpu backend layer highlighted, and KleidiAI integrated as a CPU trait that dispatches SME2, I8MM, and DotProd microkernels alt-txt#center](images/llama_components.jpg "Components of llama.cpp") KleidiAI is integrated as a trait of `ggml-cpu` in the llama.cpp CPU backend. The integration source code is located in the following directory of llama.cpp: @@ -21,7 +32,7 @@ The integration source code is located in the following directory of llama.cpp: ./ggml/src/ggml-cpu/kleidiai ``` -### Which matmul operators can use KleidiAI microkernels? +### Which matmul operators can use KleidiAI microkernels? KleidiAI matmul microkernels can be used for some types of `GGML_OP_MUL_MAT` operators. The table below lists some matmul operators with specific input and output data type that can be accelerated by KleidiAI microkernels. @@ -39,11 +50,11 @@ In this table, LHS is short for left-hand source (left-hand input matrix) and RH More operators and data types are being supported by KleidiAI microkernels. -### How does the KleidiAI path work? 
+### How the KleidiAI microkernel selection and packing path works The figure below shows how KleidiAI microkernels are used for `matmul` with `GGML_TYPE_Q4_0` or `GGML_TYPE_Q8_0` RHS(weight). -![Figure showing how kleidiai microkernel is used for quantization, packing and matrix multiply llama.cpp alt-text#center](images/kai_matmul_kernel.jpg "Quantization, packing and matrix multiply microkernels") +![Diagram showing the KleidiAI microkernel pipeline for a quantized matmul operation: the RHS weight tensor is packed once at model load, the LHS activation is quantized and packed each inference step, and the selected SME2, I8MM, or DotProd GEMM kernel executes the matrix multiply alt-txt#center](images/kai_matmul_kernel.jpg "Quantization, packing and matrix multiply microkernels") The packing of `GGML_TYPE_Q4_0` or `GGML_TYPE_Q8_0` weight (RHS) only needs to be performed one time when llama.cpp loads the model and weight tensor data, because the weight never changes during inference. For performance, it repacks the original GGUF weights into a layout optimized for cache-friendly access and DotProd, I8MM, and SME2 operations with the KleidiAI microkernels. @@ -54,11 +65,11 @@ SME2 -> I8MM -> DotProd ``` Once the matmul microkernel is decided, its corresponding RHS packing and LHS quantizing & packing micro-kernel will be used. -## Call stacks +## Execution call stacks for KleidiAI microkernels The call stacks below are included as a way to make this integration concrete. Read each stack from top to bottom: the top frames show the high-level llama.cpp operation you triggered (loading the model or decoding tokens), the middle frames show where execution enters the `ggml-cpu` backend, and the bottom frame is the specific KleidiAI microkernel that runs. The first stack shows a one-time weight repack during model load, which happens because the model weights are constant during inference. 
The later stacks show what happens during inference: activations change from token to token, so the LHS is quantized and packed each time, and then the selected SME2 GEMM or GEMV microkernel executes the matmul. -### During model load +### RHS weight packing during model load In case of using the Llama-3.2-3B-Instruct-Q4_0.gguf model and SME2 microkernels, RHS packing is done by the *kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon* microkernel when loading the model. It is shown in following function call stack, @@ -72,7 +83,7 @@ llama_model_load kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon ``` -### During inference +### LHS activation quantization during inference The F32 activation input matrix (LHS) is dynamically quantized and packed by the *kai_run_lhs_quant_pack_qsi8d32p_f32_neon* microkernel every time, since the activation input keeps changing during the model run. It is done by following function call stack, @@ -92,7 +103,7 @@ llama_context::decode ``` Once the LHS and RHS is ready, KleidiAI matmul microkernel can be executed. -### During prefill +### SME2 GEMM microkernel execution during prefill `Llama-3.2-3B-Instruct-Q4_0.gguf` model and 512-bit SME2 streaming vector length are used as an example. At the Prefill stage, the KleidiAI GEMM microkernel optimized with SME2, *kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa*, is selected by the KleidiAI trait, it produces a dequantized F32 output matrix. It is done right after LHS quantizing and packing by function call stack shown below. 
```text @@ -111,7 +122,7 @@ llama_context::decode kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa ``` -### During token decode +### SME2 GEMV microkernel execution during token decode At the LLM decode stage, KleidiAI GEMV micro-kernel optimized with SME2, *kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot*, is selected by the KleidiAI trait in llama.cpp, it produces a dequantized F32 output vector. It is done right after LHS quantizing & packing by function call stack shown below, @@ -131,4 +142,10 @@ llama_context::decode kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot ``` -Now that you know where the KleidiAI SME2 path lives in llama.cpp and what it accelerates, move on to the next section to build the binary for inference. +## What you've accomplished and what's next + +In this section: +- You traced the KleidiAI SME2 path through the llama.cpp CPU backend, from model load through prefill and token decode +- You identified the specific microkernels that handle each stage and the priority order (SME2 → I8MM → DotProd) used for microkernel selection + +In the next section, you'll set up the cross-compile toolchain and build a statically linked `llama-cli` binary with SME2 and KleidiAI enabled. 
diff --git a/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/run_llm.md b/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/run_llm.md index 8cfb1f5a53..f3fc3d37d1 100755 --- a/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/run_llm.md +++ b/content/learning-paths/mobile-graphics-and-gaming/performance_llama_cpp_sme2/run_llm.md @@ -1,14 +1,14 @@ --- -title: Run the Llama-3.2-3B-Instruct-Q4_0.gguf model with llama-cli +title: Measure SME2 acceleration in llama.cpp on Android weight: 5 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Run the Llama-3.2-3B-Instruct-Q4_0.gguf model with llama-cli +## Compare performance with SME2 enabled and disabled -In this section, you run the model on an SME2-capable Android device. You will compare performance with SME2 enabled and disabled. +In this section, you run the model on an SME2-capable Android device and compare performance with SME2 enabled and disabled. Put the built `llama-cli` executable and the `Llama-3.2-3B-Instruct-Q4_0.gguf` model file on your Android target that supports SME2. Using Android Debug Bridge (ADB) is usually the easiest way to transfer files and open a shell on the device. @@ -19,8 +19,8 @@ curl -L -o Llama-3.2-3B-Instruct-Q4_0.gguf \ https://huggingface.co/unsloth/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q4_0.gguf ``` -## Transfer the files using ADB -This subsection shows a repeatable way to copy your `llama-cli` binary and GGUF model from your host machine to an Android target device, and then run the same commands from an interactive shell on the device. +### Transfer the files using ADB +This section shows a repeatable way to copy your `llama-cli` binary and GGUF model from your host machine to an Android target device, and then run the same commands from an interactive shell on the device. 
Enable **Developer options** and **USB debugging** on the Android device, connect it over USB (or over the network if your setup supports it), and verify that your host can see the device: @@ -44,17 +44,17 @@ adb shell cd /data/local/tmp/llama_sme2 ``` -The figure below shows the architecture of Llama-3.2-3B model, -![Figure showing Llama-3.2-3B architecture alt-text#center](images/llama-3.2-3b_architecture.jpg "Architecture of Llama-3.2-3B") +The figure below shows the architecture of the Llama-3.2-3B model: +![Architecture diagram of the Llama-3.2-3B model showing the transformer block structure, including attention heads, feed-forward layers, and the overall token embedding and output projection flow alt-text#center](images/llama-3.2-3b_architecture.jpg "Architecture of Llama-3.2-3B") -## Inference with SME2 enabled +### Run inference with SME2 enabled -You run the model by binding it to a single Arm C1-Pro core with CPU affinity. To enable SME2 microkernels, you must set the `GGML_KLEIDIAI_SME` environment variable before running the application. +To enable SME2 microkernels, set the `GGML_KLEIDIAI_SME` environment variable before running the application. The flags used here are: `taskset 2` pins the process to CPU core 2 (the Arm C1-Pro core), `-st` enables single-token generation mode to report per-token performance stats, `-C 0x2 -Cb 0x2` sets CPU affinity for operator execution, and `-t 1` limits inference to one thread. If `taskset` is not available on your device, run the command without it. Run an inference on your target device: ```bash -env GGML_KLEIDIAI_SME="1" taskset 2 ./llama-cli -m ./Llama-3.2-3B-Instruct-Q4_0.gguf -st -C 0x2 -Cb 0x2 -t 1 -p "input your prompt" +env GGML_KLEIDIAI_SME="1" taskset 2 ./llama-cli -m ./Llama-3.2-3B-Instruct-Q4_0.gguf -st -C 0x2 -Cb 0x2 -t 1 -p "What is Arm SME2 and how does it accelerate AI workloads?" 
``` Here `env GGML_KLEIDIAI_SME="1"` enables SME2 microkernels, `taskset 2` binds the `llama-cli` process to CPU core 2 (the Arm C1-Pro core in our case), `-C 0x2 -Cb 0x2` sets the CPU affinity for operator execution, and `-t 1` sets the number of threads to one. If `taskset` is not available on your target device, run the same command without it. @@ -65,35 +65,37 @@ With the SME2 kernels enabled, you'll notice the following performance output: [ Prompt: 12.3 t/s | Generation: 9.1 t/s ] ``` -## Inference without SME2 enabled +Your results will vary depending on the device and the SME2 streaming vector length available. -For a performance comparison, you can run the model with SME2 microkernels disabled. In this scenario, I8MM and Dotprod microkernels are used instead. +### Baseline performance without SME2 + +For a performance comparison, run the model with SME2 microkernels disabled. In this scenario, I8MM and DotProd microkernels are used instead. ```bash -env GGML_KLEIDIAI_SME="0" taskset 2 ./llama-cli -m ./Llama-3.2-3B-Instruct-Q4_0.gguf -st -C 0x2 -Cb 0x2 -t 1 -p "input your prompt" +env GGML_KLEIDIAI_SME="0" taskset 2 ./llama-cli -m ./Llama-3.2-3B-Instruct-Q4_0.gguf -st -C 0x2 -Cb 0x2 -t 1 -p "What is Arm SME2 and how does it accelerate AI workloads?" ``` -With the resulting performance: +The output is similar to: ```output [ Prompt: 7.9 t/s | Generation: 5.9 t/s ] ``` -## Optional: Use Streamline to profile +## Profile with Arm Streamline (optional) You can profile the model execution with the approach introduced in [Profile llama.cpp performance with Arm Streamline and KleidiAI LLM kernels](https://learn.arm.com/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/). The Streamline Timeline view and Annotate Markers in the figure below show that the token generation speeds up significantly at both Prefill and Decode stage. 
The PMU event counters show that many SME2 instructions, especially SME2 Integer Outer Product Accumulate instructions at the Prefill stage and SME2 Integer Outer Product instructions at the Decode stage, are used for acceleration. -![Figure showing Streamline Timeline view alt-text#center](images/streamline_timeline_combined.jpg "Combined Streamline Timeline view with and without SME2") +![Arm Streamline Timeline view showing side-by-side token generation runs with SME2 enabled and disabled, with Annotate markers highlighting Prefill and Decode stages and PMU counters showing SME2 Integer Outer Product instructions alt-text#center](images/streamline_timeline_combined.jpg "Combined Streamline Timeline view with and without SME2") The Streamline Call Paths view below indicates similar speedup, it also shows that DotProd and I8MM KleidiAI microkernels are used instead when SME2 is not enabled. -![Figure showing Streamline Call Paths view alt-text#center](images/streamline_call_paths_combined.jpg "Combined Streamline Call Paths view with and without SME2") +![Arm Streamline Call Paths view comparing inference with SME2 enabled against DotProd and I8MM fallback paths, showing the KleidiAI microkernel selected in each case alt-text#center](images/streamline_call_paths_combined.jpg "Combined Streamline Call Paths view with and without SME2") -## Optional: Print the kernel names at runtime +## Print KleidiAI kernel names at runtime (optional) -To investigate which operators in the model graph are delegated to KleidiAI microkernels, you can add some code as below to *./ggml/src/ggml-cpu/kleidiai/kleidiai.cpp* and recompile the binary. When you run the inference, the names of operators that make use of KleidiAI microkernels will be printed. This is only for debugging purposes. +To investigate which operators in the model graph are delegated to KleidiAI microkernels, you can add the code shown below to `./ggml/src/ggml-cpu/kleidiai/kleidiai.cpp` and recompile the binary. 
When you run the inference, the names of operators that make use of KleidiAI microkernels will be printed. This is only for debugging purposes. ```cpp bool compute_forward(struct ggml_compute_params * params, struct @@ -125,10 +127,10 @@ kai matmul Q4_0 ffn_up-27 kai matmul Q4_0 ffn_out-27 ``` -## Summary +## What you've accomplished and what's next -KleidiAI support in llama.cpp is still evolving, more operators will be accelerated by KleidiAI microkernels, unleashing greater potential of SME2. +You've run a quantized LLM on an SME2-capable Android device and measured the throughput difference with KleidiAI SME2 microkernels enabled and disabled. The results demonstrate meaningful gains at both the Prefill and Decode stages, making on-device LLM inference faster and more power-efficient on Arm hardware. -With out-of-box KleidiAI and SME2 support in llama.cpp, you can get significant performance uplift at both Prefill and Decode stage, which enhances the experience of running LLM locally on device. +With out-of-the-box KleidiAI and SME2 support in llama.cpp, you can achieve significant performance improvements without any changes to the model or application code. -Now that you have validated the runtime behavior with SME2 on your target device, move on to the Next Steps section to explore profiling and related Arm on-device AI performance resources. +Move on to the Next Steps section to explore profiling and related Arm on-device AI performance resources. 
diff --git a/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/_index.md b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/_index.md index ffefd81753..4d0ba09b47 100644 --- a/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/ai-agent-on-cpu/_index.md @@ -21,6 +21,11 @@ author: Andrew Choi ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md index e450de9c74..4a3f9d07b8 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md @@ -22,6 +22,8 @@ author: Julien Simon # Tagging metadata, see the Learning Path guide for the allowed values skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_index.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_index.md index a7421ff49e..b3462f12c4 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_index.md @@ -22,10 +22,10 @@ author: Julien Simon # Tagging metadata, see the Learning Path guide for the allowed values skilllevels: Introductory subjects: ML -armips: - - Neoverse cloud_service_providers: - Google Cloud +armips: + - Neoverse tools_software_languages: - Google Cloud - Hugging Face diff --git 
a/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/_index.md b/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/_index.md index 0c06559bc0..4fbcd591bc 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/_index.md @@ -1,5 +1,5 @@ --- -title: Migrate applications between Arm platforms with AI assistance +title: Migrate applications between Arm platforms using Kiro Arm SoC Migration Power draft: true cascade: @@ -7,25 +7,31 @@ cascade: minutes_to_complete: 60 -who_is_this_for: This learning path is for developers migrating applications between Arm platforms using using AI-assisted tooling with Kiro's ARM SoC Migration Power. You will learn a practical, repeatable migration workflow through an example that moves an application from the cloud to the edge — from AWS Graviton (Neoverse-based) to Raspberry Pi 5 (Cortex-A based). +who_is_this_for: This is an advanced topic for experienced developers who need to migrate applications between Arm-based platforms using AI-assisted tooling. You will work through a structured, repeatable migration workflow using Kiro Arm SoC Migration Power, moving an application from AWS Graviton3 (Neoverse) to Raspberry Pi 5 (Cortex-A). The techniques apply broadly to cloud-to-edge and cross-architecture migrations across the Arm ecosystem. 
learning_objectives: - - Install and configure Kiro's ARM SoC Migration Power - - Understand a structured migration workflow applicable across Arm platforms - - Use AI-guided migration to identify platform-specific and hardware-dependent code - - Create Hardware Abstraction Layers with power assistance - - Validate and verify migrations with automated analysis + - Install and configure Kiro Arm SoC Migration Power + - Apply a structured migration workflow across Arm platforms + - Identify platform-specific and hardware-dependent code using AI-guided analysis + - Implement hardware abstraction layers to isolate platform-specific dependencies + - Validate and verify the migrated application using automated analysis prerequisites: - - Access to a source and target Arm platforms (the example uses AWS Graviton3 and Raspberry Pi 5) - - Basic understanding of C programming - - Familiarity with embedded systems, Linux environments, or cloud computing concepts + - Access to both source and target Arm platforms (for example, AWS Graviton3 and Raspberry Pi 5) + - Working knowledge of C programming + - Familiarity with Linux development environments and basic embedded or cloud deployment concepts + - Experience building applications with GCC and CMake author: Daniel Schleicher ### Tags skilllevels: Advanced subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse - Cortex-A @@ -40,7 +46,7 @@ tools_software_languages: further_reading: - resource: - title: Kiro ARM SoC Migration Power Documentation + title: Kiro Arm SoC Migration Power Documentation link: https://kiro.dev/powers/arm-soc-migration type: documentation - resource: @@ -63,4 +69,5 @@ further_reading: ### FIXED, DO NOT MODIFY weight: 1 layout: learningpathall +learning_path_main_page: "yes" --- diff --git a/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/graviton-development.md 
b/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/graviton-development.md index 507e669274..9129d6679c 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/graviton-development.md +++ b/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/graviton-development.md @@ -7,15 +7,14 @@ layout: learningpathall --- ## Develop application on the source Arm platform -In this section, you will build and validate the application on the source Arm platform before performing any migration steps. -This example uses AWS Graviton3 as the source platform, however the same principles apply to any Arm-Arm migration scenario (e.g., from Raspberry Pi 4 to Pi 5, from i.MX8 to Jetson, etc.). +In this section, you will build and validate the application on the source Arm platform before performing any migration steps. -Your development environment (Kiro IDE) remains local. The Graviton instance acts as the remote Arm test platform. This mirrors real-world workflows where development occurs locally and deployment targets remote Arm systems. +This example uses AWS Graviton3 as the source platform. The same principles apply to any Arm-to-Arm migration scenario, such as Raspberry Pi 4 to Raspberry Pi 5 or i.MX8 to Jetson. -### Download the Application (Local Machine) +### Download the application (local machine) -Download the `sensor-monitor` application to your local machine (where Kiro IDE is installed). +Download the `sensor-monitor` application to your local machine (where Kiro IDE is installed). You inspect the project locally, then build and validate it on the Graviton3 instance. 
```bash wget https://github.com/ArmDeveloperEcosystem/arm-learning-paths/raw/main/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/projects/sensor-monitor.tar.gz @@ -23,9 +22,9 @@ tar -xzf sensor-monitor.tar.gz cd sensor-monitor ``` -The package includes the complete source code, a Makefile, and platform-specific implementations. You will analyze and migrate this code using the ARM SoC Migration Power. +The archive includes the complete source code, a Makefile, and platform-specific implementations. You will analyze and migrate this code using the Arm SoC Migration Power. -### Upload to the Graviton Instance for Testing +### Upload to the Graviton instance for testing Before migrating, verify that the application builds and runs correctly on the source Arm platform. @@ -35,7 +34,7 @@ Upload the archive to the Graviton instance: scp -i graviton-migration-key.pem sensor-monitor.tar.gz ec2-user@$(aws ec2 describe-instances --filters "Name=tag:Name,Values=graviton-migration-source" "Name=instance-state-name,Values=running" --query 'Reservations[0].Instances[0].PublicIpAddress' --output text):~ ``` -### Build and Test on Graviton +### Build and test on Graviton Connect to the instance: @@ -51,15 +50,17 @@ cd sensor-monitor make ./sensor_monitor ``` -If successful, the application will compile with GCC (AArch64 target) and display simulated sensor readings. + +If successful, the application compiles for AArch64 using GCC and displays simulated sensor readings. This confirms: - * The toolchain is functional - * The application builds cleanly on Arm64 Linux - -This state becomes your baseline reference for migration validation. -### Application Overview +- The toolchain is functional +- The application builds cleanly on Arm64 Linux + +This validated build becomes your migration baseline. Any functional differences after migration can be compared against this known-good state. 
+ +### Application overview The `sensor-monitor` application demonstrates a common embedded/edge design pattern: business logic separated from platform-specific hardware interaction. @@ -74,19 +75,18 @@ sensor-monitor/ └── README.md # Documentation ``` -**Key Components:** +**Key components:** `src/main.c` - Main application that reads sensor data in a loop `include/sensor.h` - Hardware abstraction interface `platform/graviton/sensor_graviton.c` - Simulated sensor for cloud development -## Expected Outcome +## What you've accomplished and what's next + +In this section: -At this stage, you should have: - - A working application running on the source Arm platform - - Simulated sensor output from the Graviton instance - - A validated baseline for functional comparison - -You are now ready to analyze the code using the ARM SoC Migration Power and begin adapting it for the target platform (Raspberry Pi 5). +- You built and ran the sensor-monitor application on the Graviton3 source platform +- You confirmed the toolchain and build process work correctly on Arm64 Linux +- You established your baseline for migration validation -This establishes your baseline application before migration to the target platform. +In the next section, you'll use the Arm SoC Migration Power to analyze the codebase and migrate it to the target platform. 
diff --git a/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/migration.md b/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/migration.md index f7a9dd6ed7..b3ffc1d1bb 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/migration.md +++ b/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/migration.md @@ -1,42 +1,43 @@ --- -title: Migrate using ARM SoC Migration Power +title: Migrate using Arm SoC Migration Power weight: 4 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Use ARM SoC Migration Power for AI-guided migration +## Use Arm SoC Migration Power for AI-guided migration -In this section you will learn how to use the ARM SoC Migration Power to migrate your application between Arm-based platforms. The example demonstrates migration from: +In this section, you use the Arm SoC Migration Power to migrate your application between Arm-based platforms. The example demonstrates migration from: - **Source:** AWS Graviton3 (Neoverse-V1, Arm64 Linux, cloud deployment) - **Target:** Raspberry Pi 5 (BCM2712, Cortex-A76, edge deployment) -However, the workflow applies to any Arm-to-Arm migration. +However, the workflow applies to any Arm-to-Arm migration. The migration process follows four phases: discovery, architecture analysis, abstraction design, and platform-specific implementation. -### Initiate Migration +For each step below, an example prompt for the Graviton-to-Pi-5 scenario is provided alongside a general pattern you can adapt for other platform pairs. Being explicit in your prompts improves the quality and precision of the Power’s architectural analysis. -Open Kiro and and describe your migration clearly using the ARM SoC Migration Power. +### Initiate migration + +Open Kiro and describe your migration clearly using the Arm SoC Migration Power. 
Example prompt: ``` -I want to use the ARM SoC Migration Power to migrate my sensor monitoring +I want to use the Arm SoC Migration Power to migrate my sensor monitoring application from AWS Graviton3 to Raspberry Pi 5 (BCM2712). The application currently uses simulated sensors on Graviton and needs to work with real GPIO and SPI hardware on the Pi 5. ``` -The general pattern to use for other Arm-based platform migrations: +The general pattern for other Arm-based platform migrations: ``` -I want to migrate my [application type] from [source ARM SoC] to [target ARM SoC]. +I want to migrate my [application type] from [source Arm SoC] to [target Arm SoC]. The application uses [source-specific features] and needs [target-specific features]. ``` -Being explicit improves the quality of the Power’s architectural reasoning. -### Discovery Phase +### Discovery phase -The power will prompt you for information about your platforms: +Provide detailed information about both platforms: **Example for Graviton → Raspberry Pi 5:** ``` @@ -46,56 +47,49 @@ Hardware Requirements: GPIO for LEDs, SPI for temperature sensor ``` Next, instruct the Power to analyze your codebase for platform dependencies: -Example Prompt: +Example prompt: ``` Scan my codebase for Graviton-specific code that needs migration to BCM2712. Focus on sensor interfaces and any cloud-specific assumptions. ``` -The general pattern to use for other Arm-based platform migrations: +The general pattern for other Arm-based platform migrations: ``` Scan my codebase for [source platform]-specific code that needs migration to [target platform]. Focus on [platform-specific features]. ``` -This step ensures migration is systematic rather than ad hoc. +This ensures the migration is systematic, traceable, and grounded in explicit platform assumptions. -### Architecture Analysis +### Architecture analysis Now compare the architectural characteristics of the platforms. 
-Example Prompt: +Example prompt: ``` Compare Graviton3 and BCM2712 architecture capabilities. What are the key differences I need to handle for cloud-to-edge migration? ``` -**Example output for Graviton → Pi 5:** -- CPU differences (Neoverse-V1 vs Cortex-A76) -- Memory constraints (cloud 64GB+ vs edge 4-8GB) -- SIMD capabilities (SVE vs NEON) -- Peripheral requirements (none vs GPIO/SPI/I2C) - -The power analyzes architecture differences for any Arm SoC pair and identifies migration challenges. +For Graviton3 to Pi 5, the Power identifies key differences such as CPU architecture (Neoverse-V1 vs Cortex-A76), available memory (cloud 64 GB+ vs edge 4–8 GB), SIMD capabilities (SVE vs NEON), and peripheral requirements (none vs GPIO/SPI/I2C). The Power analyzes these differences for any Arm SoC pair and identifies the migration challenges you need to address. ### Design the Hardware Abstraction Layer (HAL) -Now formalize a platform-independent interface. Ask the power to design a Hardware Abstraction Layer for your platforms: +Now formalize a platform-independent interface. Ask the Power to design a Hardware Abstraction Layer (HAL) for your platforms: -Example Prompt: +Example prompt: ``` Help me design a HAL layer that supports both Graviton (cloud mocks) and BCM2712 (real hardware). I need GPIO and SPI abstraction. ``` - -The general pattern to use for other Arm-based platform migrations: +The general pattern for other Arm-based platform migrations: ``` Help me design a HAL layer that supports both [source platform] and [target platform]. I need [feature] abstraction. 
``` -The Power may propose a structured abstraction such as the example shown `hal/sensor.h`: +The Power may propose a structured abstraction such as the example shown in `hal/sensor.h`: ```c typedef struct { @@ -107,22 +101,22 @@ typedef struct { extern const sensor_hal_t *sensor_hal; ``` -### Implement Target Platform Support +### Implement target platform support Now generate or refactor platform-specific code using the Power: -Example Prompt: +Example prompt: ``` Help me refactor my sensor code for BCM2712 compatibility. Show me how to implement real SPI sensor communication for the Pi 5. ``` -The general pattern to use for other Arm-based platform migrations: +The general pattern for other Arm-based platform migrations: ``` Help me implement [feature] for [target platform]. Show me how to [specific requirement]. ``` -The power will provide target platform-specific code. Example for `platform/bcm2712/sensor_bcm2712.c`: +The Power will provide target platform-specific code. Example for `platform/bcm2712/sensor_bcm2712.c`: ```c #include "hal/spi.h" @@ -143,24 +137,25 @@ void sensor_cleanup(void) { spi_cleanup(); } ``` +Review all generated or refactored code carefully to ensure correctness, performance, and alignment with your hardware constraints. -### Update the Build System +### Update the build system -Ask the power to update your build system for multi-platform support: +Ask the Power to update your build system for multi-platform support: -Example Prompt: +Example prompt: ``` Update my build system for dual Graviton/BCM2712 support with proper platform selection and cross-compilation. ``` -The general pattern to use for other Arm-based platform migrations: +The general pattern for other Arm-based platform migrations: ``` Update my build system for [source platform]/[target platform] support with proper platform selection and cross-compilation. ``` -The power will generate a platform-aware build configuration. 
Example `CMakeLists.txt`: +The Power will generate a platform-aware build configuration. Example `CMakeLists.txt`: ```cmake cmake_minimum_required(VERSION 3.16) @@ -185,12 +180,13 @@ endif() add_executable(sensor_monitor ${COMMON_SOURCES} ${PLATFORM_SOURCES}) ``` -## Expected Outcome +## What you've accomplished and what's next + +In this section: -After completing this section, you should have used the Kiro Power to: -- Analyze architecture differences between your source and target platforms -- Design HAL interfaces that abstract platform differences -- Generate platform-specific code for both platforms -- Configure the build system for your target platform +- You used the Arm SoC Migration Power to analyze architecture differences between Graviton3 and BCM2712 +- You designed a HAL that abstracts platform differences and preserves portability +- You generated platform-specific code for the target device +- You updated the build system for multi-platform support -This workflow applies to any ARM SoC migration, not just Graviton to Raspberry Pi 5. +In the next section, you'll validate the migration by building for both platforms and running the application on the Raspberry Pi 5. 
diff --git a/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/setup.md b/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/setup.md index 8d2f76a7e6..f227bd94ed 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/setup.md +++ b/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/setup.md @@ -1,20 +1,20 @@ --- -title: Install ARM SoC Migration Power +title: Install Arm SoC Migration Power weight: 2 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Install and configure Kiro's ARM SoC Migration Power +## Install and configure Kiro Arm SoC Migration Power -In this section, you will install Kiro IDE, enable the Kiro ARM SoC Migration Power, and prepare the required development environment. +In this section, you will install Kiro IDE, enable Kiro Arm SoC Migration Power, and prepare the development environment. -Kiro runs locally on your development machine. The Migration Power uses the Arm MCP server deployed as a containerized backend (via Docker) to provide Arm-specific guidance. You will also provision an AWS Graviton3 instance to act as the source platform for the migration example. +Kiro runs locally on your development machine. The Migration Power uses the Arm MCP server deployed as a containerized backend (using Docker) to provide Arm-specific guidance. You will also provision an AWS Graviton3 instance to act as the source platform for the migration example. ### Install Kiro IDE -Kiro IDE provides AI-powered development assistance and hosts the ARM SoC Migration Power that will guide you through the migration process. +Kiro IDE provides AI-powered development assistance and hosts the Arm SoC Migration Power that will guide you through the migration process. 
Download and install Kiro IDE for your platform: @@ -24,33 +24,41 @@ brew install --cask kiro ``` **Windows and Linux:** -Download the installer from [https://kiro.dev](https://kiro.dev) +Visit [https://kiro.dev](https://kiro.dev) to download the installer for your platform. + +After installation completes, launch Kiro IDE to proceed. + -Launch Kiro IDE after installation completes. ### Install the Power in Kiro -The ARM SoC Migration Power extends Kiro with specialized knowledge and tools for migrating applications between Arm platforms. +The Arm SoC Migration Power extends Kiro with specialized knowledge and tools for migrating applications between Arm platforms. + +- Open Kiro IDE +- Navigate to the **Powers** panel. Press Cmd + Shift + P (Mac) or Ctrl + Shift + P (Windows) +- Select the Arm SoC Migration Power in the **Recommended** section +- Select **Install** -1. Open Kiro IDE -2. Navigate to Powers panel. Press Cmd + Shift + P (Mac) or Ctrl + Shift + P (Windows) -3. Click on the ARM SoC Migration Power in the Recommended section -4. Click Install +### Verify installation -### Verify Installation -After installation, test the Power by entering: "I just installed the arm-soc-migration power and want to use it." +After installation, enter the following prompt in Kiro: +```text +I just installed the Arm SoC Migration Power and want to use it. +``` The Power should respond and guide you through any additional setup steps. It supports migrations across a wide range of Arm-based platforms, including: - * AWS Graviton (Neoverse-based servers) - * Raspberry Pi (Cortex-A) - * NVIDIA Jetson - * NXP i.MX - * Other Linux-based Arm SoCs -### Install Prerequisites -The ARM SoC Migration Power uses the Arm MCP (Model Context Protocol) server to provide specialized Arm migration capabilities. The Arm MCP server runs via Docker. 
+- AWS Graviton (Neoverse-based servers) +- Raspberry Pi (Cortex-A) +- NVIDIA Jetson +- NXP i.MX +- Other Linux-based Arm SoCs + +### Install prerequisites + +The Arm SoC Migration Power uses the Arm MCP (Model Context Protocol) server to provide specialized Arm migration capabilities. The Arm MCP server runs via Docker. Install Docker on your local development machine (required for ARM MCP server): @@ -79,15 +87,15 @@ docker --version ``` {{% notice Note %}} -Ensure Docker is running before using the ARM SoC Migration Power. The power will automatically pull and run the ARM MCP server container when needed. +Ensure Docker is running before using the Arm SoC Migration Power. The Power will automatically pull and run the Arm MCP server container when needed. {{% /notice %}} -### Launch AWS Graviton Instance (Source Platform) +### Launch AWS Graviton3 instance (source platform) -You will use an AWS Graviton instance as the source platform in this migration scenario. +You will use an AWS Graviton3 instance as the source platform in this migration scenario. {{% notice Note %}} -Before proceeding, ensure you are authenticated with AWS CLI. You must have valid AWS credentials configured to create and manage EC2 instances. +Before proceeding, ensure you are authenticated with the AWS CLI. Follow the [AWS CLI install guide](/install-guides/aws-cli/) if you haven't configured credentials yet. Verify your AWS CLI authentication: ```bash @@ -96,12 +104,51 @@ aws sts get-caller-identity If you see an error or need to configure AWS CLI, follow the [AWS CLI Configuration Guide](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) to set up your credentials. {{% /notice %}} -Create an SSH key, security group, and launch a `c7g.medium` Graviton3 instance: +Create an SSH key and security group, then launch a `c7g.medium` Graviton3 instance. Run each command separately to make it easier to identify any errors. 
+ +Create the SSH key: ```bash -aws ec2 create-key-pair --key-name graviton-migration-key --query 'KeyMaterial' --output text > graviton-migration-key.pem && chmod 400 graviton-migration-key.pem && SG_ID=$(aws ec2 create-security-group --group-name graviton-migration-sg --description "Security group for ARM SoC migration" --query 'GroupId' --output text) && aws ec2 authorize-security-group-ingress --group-id $SG_ID --protocol tcp --port 22 --cidr 0.0.0.0/0 && aws ec2 run-instances --image-id $(aws ec2 describe-images --owners amazon --filters "Name=name,Values=al2023-ami-2023*-arm64" "Name=state,Values=available" --query 'reverse(sort_by(Images, &CreationDate))[0].ImageId' --output text) --instance-type c7g.medium --key-name graviton-migration-key --security-group-ids $SG_ID --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value=graviton-migration-source}]' --query 'Instances[0].InstanceId' --output text +aws ec2 create-key-pair --key-name graviton-migration-key \ + --query 'KeyMaterial' --output text > graviton-migration-key.pem +chmod 400 graviton-migration-key.pem ``` -Wait approximately 30 seconds for the instance to enter the running state. Retrieve the SSH command: + +Create the security group and allow SSH access. Restrict SSH access to your current public IP address. 
+ +```bash +SG_ID=$(aws ec2 create-security-group \ + --group-name graviton-migration-sg \ + --description "Security group for Arm SoC migration" \ + --query 'GroupId' --output text) + +MY_IP=$(curl -s https://checkip.amazonaws.com) + +aws ec2 authorize-security-group-ingress \ + --group-id $SG_ID \ + --protocol tcp \ + --port 22 \ + --cidr ${MY_IP}/32 +``` + + +Find the latest Amazon Linux 2023 arm64 AMI and launch the instance: + +```bash +AMI_ID=$(aws ec2 describe-images \ + --owners amazon \ + --filters "Name=name,Values=al2023-ami-2023*-arm64" "Name=state,Values=available" \ + --query 'reverse(sort_by(Images, &CreationDate))[0].ImageId' \ + --output text) +aws ec2 run-instances \ + --image-id $AMI_ID \ + --instance-type c7g.medium \ + --key-name graviton-migration-key \ + --security-group-ids $SG_ID \ + --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value=graviton-migration-source}]' \ + --query 'Instances[0].InstanceId' --output text +``` +Wait until the instance state is `running`. Retrieve the SSH command: ```bash echo "ssh -i graviton-migration-key.pem ec2-user@$(aws ec2 describe-instances --filters "Name=tag:Name,Values=graviton-migration-source" "Name=instance-state-name,Values=running" --query 'Reservations[0].Instances[0].PublicIpAddress' --output text)" @@ -109,24 +156,23 @@ echo "ssh -i graviton-migration-key.pem ec2-user@$(aws ec2 describe-instances -- Copy and execute the output command to connect to your instance. -### Install Development Tools on Graviton Instance +### Install development tools on the Graviton3 instance -The Graviton instance needs some development tools (gcc, make) to compile the sensor-monitor example application. You will also install wget and tar for downloading and extracting files. +The Graviton3 instance needs development tools to compile the sensor-monitor example application. 
-Once connected to your Graviton instance, install the required tools: +Once connected to your Graviton3 instance, install the required tools: ```bash sudo dnf install -y gcc make wget tar ``` -## Expected Outcome +## What you've accomplished and what's next + +In this section: -After completing this section, you should have: -- Kiro IDE installed locally -- ARM SoC Migration Power installed and verified -- AWS Graviton c7g.medium instance running Amazon Linux 2023 (arm64) -- Build tools installed on the source platform -- SSH key saved locally for secure access +- You installed Kiro IDE and the Arm SoC Migration Power +- You provisioned an AWS Graviton3 instance as your source platform +- You installed the build tools needed for the migration example -You are now ready to build and test the application on the source platform before migrating it to the target edge device (Raspberry Pi 5). +In the next section, you'll build and test the sensor-monitor application on the Graviton3 instance to establish a validated baseline before migration. diff --git a/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/validation.md b/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/validation.md index d9c5f230e5..caec43b9bd 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/validation.md +++ b/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/validation.md @@ -7,7 +7,8 @@ layout: learningpathall --- ## Validate migration using the Power's testing recommendations -Migration is not complete until the application is validated on both source and target platforms. + +Migration is not complete until the application is validated on both source and target platforms under realistic conditions. 
In this section, you will use the Arm SoC Migration Power's testing recommendations to: @@ -18,20 +19,20 @@ In this section, you will use the Arm SoC Migration Power's testing recommendati The example shows Graviton to Raspberry Pi 5, but the validation approach applies to any Arm platform migration. -### Source Platform Build Verification +### Source platform build verification After introducing abstraction layers and multi-platform build support, first confirm that the original platform still behaves correctly: -Example Prompt: +Example prompt: ``` Help me verify my Graviton build still works after adding BCM2712 support. ``` -The general pattern to use with other Arm-based platforms: +The general pattern for other Arm-based platforms: ``` Help me verify my [source platform] build still works after adding [target platform] support. ``` -Follow the power's guidance. Example for Graviton: +Follow the Power's guidance. Example for Graviton: ```bash cmake -DTARGET_PLATFORM=GRAVITON -B build-graviton @@ -39,16 +40,16 @@ cmake --build build-graviton ./build-graviton/sensor_monitor ``` -### Cross-Compile for the Target Platform +### Cross-compile for the target platform Ask the Power for cross-compilation guidance: -Example Prompt: +Example prompt: ``` Guide me through cross-compiling for BCM2712 and deploying to Raspberry Pi 5. ``` -The general pattern to use for other Arm-based platforms: +The general pattern for other Arm-based platforms: ``` Guide me through cross-compiling for [target platform] and deploying to [target device]. ``` @@ -64,21 +65,21 @@ Deploy to target device: scp build-bcm2712/sensor_monitor pi@raspberrypi5:~/ ``` -### Target Platform Functional Testing +### Target platform functional testing Now validate behavior on the actual hardware: -Example Prompt: +Example prompt: ``` What tests should I run on the Raspberry Pi 5 to validate the migration? 
``` -The general pattern to use for other Arm-based platforms: +The general pattern for other Arm-based platforms: ``` What tests should I run on [target platform] to validate the migration from [source platform]? ``` -The power will recommend platform-appropriate tests. Example for Raspberry Pi 5: +The Power will recommend platform-appropriate tests. Example for Raspberry Pi 5: - GPIO functionality tests - SPI communication validation - Real sensor reading verification @@ -91,42 +92,28 @@ ssh pi@raspberrypi5 ./sensor_monitor ``` -### Performance Comparison +### Performance comparison -Ask the power to compare platform performance: +Ask the Power to compare platform performance: -Example Prompt: +Example prompt: ``` Compare performance between Graviton development and BCM2712 deployment. ``` -The general pattern to use for other Arm-based platforms: +The general pattern for other Arm-based platforms: ``` Compare performance between [source platform] and [target platform] for my application. ``` -The power will analyze platform-specific characteristics. Example analysis: +The Power will analyze platform-specific characteristics. Example analysis: - CPU performance differences - Memory usage comparison - I/O timing characteristics - Power consumption differences -## Expected Outcome - -After completing validation, you should have: -- Validated source platform build -- Cross-compiled target platform binary -- Power-verified platform-specific tests passing -- Performance comparison report - -## Key Takeaways - -1. **AI-Assisted Migration is Powerful** - The ARM SoC Migration Power provides expert guidance for any Arm-to-Arm platform migration, with automated architecture analysis and code generation following best practices. - -2. **Abstraction Preserves Portability** - HAL layers enable development on one platform while maintaining compatibility with others. The same business logic runs across different Arm SoCs. - -3. 
**The Power Enforces Safety** - It preserves functional behavior during migration, validates architecture compatibility, and recommends proper testing strategies for your specific platforms. +## What you've accomplished -4. **Workflow is Universal** - Discovery → Analysis → Planning → Implementation → Validation applies to any Arm SoC migration, whether cloud-to-edge, edge-to-edge, or cloud-to-cloud. +You've completed the full migration workflow: you validated the source platform build, cross-compiled for the target, ran platform-specific tests, and compared performance between Graviton3 and Raspberry Pi 5. The Arm SoC Migration Power guided each step with architecture-aware recommendations rather than generic advice. -5. **Example is Adaptable** - While this learning path uses Graviton to Raspberry Pi 5 as an example, the same workflow applies to migrations like i.MX8 to Jetson, Raspberry Pi 4 to Pi 5, or any other ARM platform combination. +The Discovery → Analysis → Planning → Implementation → Validation workflow you followed here applies to any Arm SoC migration, whether cloud-to-edge, edge-to-edge, or between any pair of Arm-based platforms. The HAL pattern preserves your application's business logic across different Arm SoCs so you can adapt the same codebase without starting from scratch. 
diff --git a/content/learning-paths/servers-and-cloud-computing/azure-arm-template/_index.md b/content/learning-paths/servers-and-cloud-computing/azure-arm-template/_index.md index 5ee81d88c4..33482e1e40 100644 --- a/content/learning-paths/servers-and-cloud-computing/azure-arm-template/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/azure-arm-template/_index.md @@ -21,6 +21,8 @@ author: Pareena Verma ### Tags skilllevels: Introductory subjects: Containers and Virtualization +cloud_service_providers: + - Microsoft Azure armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/benchmark-nlp/_index.md b/content/learning-paths/servers-and-cloud-computing/benchmark-nlp/_index.md index 7f91f0ce13..ba60a57e78 100644 --- a/content/learning-paths/servers-and-cloud-computing/benchmark-nlp/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/benchmark-nlp/_index.md @@ -18,6 +18,11 @@ author: Pareena Verma ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/_index.md b/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/_index.md index 09b634a31d..a58b39fa60 100644 --- a/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/_index.md @@ -22,6 +22,11 @@ author: Pareena Verma ### Tags skilllevels: Introductory subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/cca-veraison-aws/_index.md b/content/learning-paths/servers-and-cloud-computing/cca-veraison-aws/_index.md index 143eeef111..594716bc54 100644 --- 
a/content/learning-paths/servers-and-cloud-computing/cca-veraison-aws/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/cca-veraison-aws/_index.md @@ -18,6 +18,8 @@ author: Paul Howard ### Tags skilllevels: Advanced subjects: Performance and Architecture +cloud_service_providers: + - AWS armips: - Neoverse - Cortex-A diff --git a/content/learning-paths/servers-and-cloud-computing/clair/_index.md b/content/learning-paths/servers-and-cloud-computing/clair/_index.md index 63ceb487f0..0eae7d52f0 100644 --- a/content/learning-paths/servers-and-cloud-computing/clair/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/clair/_index.md @@ -18,6 +18,11 @@ author: Jason Andrews ### Tags skilllevels: Advanced subjects: Containers and Virtualization +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/clickhouse/_index.md b/content/learning-paths/servers-and-cloud-computing/clickhouse/_index.md index bdd493006c..4ad61f333b 100644 --- a/content/learning-paths/servers-and-cloud-computing/clickhouse/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/clickhouse/_index.md @@ -17,6 +17,11 @@ author: Pareena Verma ### Tags skilllevels: Introductory subjects: Databases +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/cobalt/_index.md b/content/learning-paths/servers-and-cloud-computing/cobalt/_index.md index 0204ea2665..00f21f45cf 100644 --- a/content/learning-paths/servers-and-cloud-computing/cobalt/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/cobalt/_index.md @@ -21,6 +21,8 @@ author: Joe Stech # Tagging metadata, see the Learning Path guide for the allowed values skilllevels: Introductory subjects: Containers and Virtualization 
+cloud_service_providers: + - Microsoft Azure armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/codec/_index.md b/content/learning-paths/servers-and-cloud-computing/codec/_index.md index f069393d0a..16327dd606 100644 --- a/content/learning-paths/servers-and-cloud-computing/codec/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/codec/_index.md @@ -26,6 +26,9 @@ test_maintenance: true ### Tags skilllevels: Introductory subjects: Libraries +cloud_service_providers: + - AWS + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/cplusplus_compilers_flags/_index.md b/content/learning-paths/servers-and-cloud-computing/cplusplus_compilers_flags/_index.md index 6686a66fd9..935a96880e 100644 --- a/content/learning-paths/servers-and-cloud-computing/cplusplus_compilers_flags/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/cplusplus_compilers_flags/_index.md @@ -18,6 +18,11 @@ author: Kieran Hejmadi ### Tags skilllevels: Introductory subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/cpp-profile-guided-optimisation/_index.md b/content/learning-paths/servers-and-cloud-computing/cpp-profile-guided-optimisation/_index.md index cc6f9c4d25..913467986b 100644 --- a/content/learning-paths/servers-and-cloud-computing/cpp-profile-guided-optimisation/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/cpp-profile-guided-optimisation/_index.md @@ -18,6 +18,11 @@ author: Kieran Hejmadi ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/csp/_index.md 
b/content/learning-paths/servers-and-cloud-computing/csp/_index.md index 6fe58aac26..21a5d1dad7 100644 --- a/content/learning-paths/servers-and-cloud-computing/csp/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/csp/_index.md @@ -17,6 +17,11 @@ prerequisites: ### Tags skilllevels: Introductory subjects: Containers and Virtualization +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/deepseek-cpu/_index.md b/content/learning-paths/servers-and-cloud-computing/deepseek-cpu/_index.md index 3fa5dea73e..08091eb673 100644 --- a/content/learning-paths/servers-and-cloud-computing/deepseek-cpu/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/deepseek-cpu/_index.md @@ -19,6 +19,11 @@ author: ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/disk-io-benchmark/_index.md b/content/learning-paths/servers-and-cloud-computing/disk-io-benchmark/_index.md index 810ea8905b..c75dde76a5 100644 --- a/content/learning-paths/servers-and-cloud-computing/disk-io-benchmark/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/disk-io-benchmark/_index.md @@ -19,6 +19,11 @@ author: Kieran Hejmadi ### Tags skilllevels: Introductory subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/_index.md b/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/_index.md index 731c9dd421..b7f2113c58 100644 --- a/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/_index.md 
+++ b/content/learning-paths/servers-and-cloud-computing/distributed-inference-with-llama-cpp/_index.md @@ -24,6 +24,8 @@ author: ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/django/_index.md b/content/learning-paths/servers-and-cloud-computing/django/_index.md index 9aa84d0447..d492f00833 100644 --- a/content/learning-paths/servers-and-cloud-computing/django/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/django/_index.md @@ -21,6 +21,11 @@ author: Diego Russo ### Tags skilllevels: Introductory subjects: Web +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/dlrm/_index.md b/content/learning-paths/servers-and-cloud-computing/dlrm/_index.md index 830e0d148b..152fe09759 100644 --- a/content/learning-paths/servers-and-cloud-computing/dlrm/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/dlrm/_index.md @@ -21,6 +21,9 @@ author: ### Tags skilllevels: Introductory subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Google Cloud armips: - Neoverse tools_software_languages: @@ -29,8 +32,6 @@ tools_software_languages: - Google Cloud operatingsystems: - Linux -cloud_service_providers: - - AWS further_reading: - resource: diff --git a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/1-overview.md b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/1-overview.md new file mode 100644 index 0000000000..8d70d6472a --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/1-overview.md @@ -0,0 +1,88 @@ +--- +title: Simplify Arm migration with the Docker MCP Toolkit and Arm MCP Server +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Why 
migrate to Arm? + +Arm-based cloud instances are now widely available across major providers, including AWS Graviton, Azure Cobalt, and Google Cloud Axion. These platforms deliver strong performance-per-watt characteristics and, for many workloads, measurable cost savings compared to equivalent x86 instances. + +For containerized applications written in portable, architecture-neutral code, migration can be straightforward: rebuild the container for `linux/arm64` and redeploy. + +However, many performance-sensitive applications are not architecture-neutral. They may include: + +- x86-specific compiler flags (for example `-mavx2`) +- Hand-optimized assembly +- AVX2 intrinsics mapped directly to Intel vector instructions +- Assumptions about register width, alignment, or instruction semantics + +In these cases, rebuilding the container is not enough. The source code itself must be adapted for Arm. + +## Considerations when migrating from x86 to Arm + +When architecture-specific optimizations are present, migration may involve: + +- Identifying x86-specific intrinsics or assembly +- Updating compiler flags and build configurations +- Mapping AVX2 operations to appropriate NEON equivalents +- Rewriting vectorized code and adjusting loop structures +- Updating Dockerfiles, base images, and compiler flags +- Validating correctness and performance on Arm systems + +These steps are well understood, but they can require careful review across code, build scripts, and container configurations. + +## What the Docker MCP Toolkit provides + +The Docker MCP Toolkit is a management interface in Docker Desktop that lets you discover, configure, and run containerized MCP (Model Context Protocol) servers. It connects these servers to AI coding assistants through a unified gateway. 
+ +## MCP servers for Arm migration + +Three MCP servers work together to support the migration workflow: + +**Arm MCP Server** + +Provides migration-focused tools: +- `migrate_ease_scan` detects x86-specific code and compiler flags +- `check_image` and `skopeo` verify container architecture support +- `knowledge_base_search` accesses learning resources, Arm intrinsics, and software version compatibility +- `mca` performs microarchitectural performance analysis + +**GitHub MCP Server** + +Enables Git repository operations including creating pull requests, managing branches, and committing changes. + +**Sequential Thinking MCP Server** + +Helps the AI assistant break down complex migration decisions into logical steps. + + +## How AI-assisted migration works + +When connected to the Docker MCP Toolkit, an AI coding assistant like GitHub Copilot can coordinate a structured migration workflow: + +- Verify whether container base images support `linux/arm64` using `check_image` or `skopeo` +- Scan the codebase with `migrate_ease_scan` to identify AVX2 intrinsics, x86-specific flags, and other portability considerations +- Use `knowledge_base_search` to find appropriate Arm SIMD equivalents for every x86 intrinsic +- Refactor the code with architecture-specific accuracy +- Update Dockerfiles and build configurations for Arm compatibility +- Create a pull request with the proposed changes using the GitHub MCP Server + +## The demo application + +This Learning Path uses a real-world example: a matrix multiplication benchmark written in C++ with AVX2 intrinsics for x86. You'll migrate it to Arm64 using the AI-assisted workflow described above. + +The demo repository is available at [github.com/JoeStech/docker-blog-arm-migration](https://github.com/JoeStech/docker-blog-arm-migration). + +By the end of this Learning Path, you'll have a working Arm64 container with NEON-optimized code and an automated pull request containing all migration changes. 
+ +## What you've learned and what's next + +You now understand: +- Why Arm migration requires more than rebuilding containers when architecture-specific code is present +- How the Docker MCP Toolkit connects AI assistants to specialized migration tools +- The structured workflow that GitHub Copilot uses to automate migration tasks + +Next, you'll install and configure the Docker MCP Toolkit with the three required MCP servers. diff --git a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/2-setup.md b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/2-setup.md new file mode 100644 index 0000000000..a05acdbaa7 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/2-setup.md @@ -0,0 +1,101 @@ +--- +title: Set up Docker MCP Toolkit with Arm, GitHub, and Sequential Thinking servers +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Before you begin + +Make sure the following tools are installed: + +- Docker Desktop 4.59 or later +- VS Code with the GitHub Copilot extension +- A GitHub account with a Personal Access Token (PAT) that allows repository access +- A machine with at least 8 GB RAM (16 GB recommended) + +You'll use Docker Desktop to host MCP servers locally, and VS Code with GitHub Copilot to invoke those servers through the MCP Gateway. + +## Enable the Docker MCP Toolkit + +The MCP Toolkit allows Docker Desktop to run and manage MCP (Model Context Protocol) servers, which expose structured tools that AI assistants can call. + +- Open Docker Desktop +- Go to **Settings** > **Beta features** +- Toggle **Enable Docker MCP Toolkit** on +- Select **Apply** + +The **MCP Toolkit** tab appears in the left sidebar. + +## Add the required MCP servers + +Open the **MCP Toolkit** in Docker Desktop and select the **Catalog** tab. 
Add the following three servers: + +### Arm MCP Server + +Search for **Arm** in the catalog and add the [Arm MCP Server](https://hub.docker.com/mcp/server/arm-mcp/overview). + +Configure it by setting the directory path to your local code. This allows the `migrate_ease_scan` and `mca` tools to access your source files. Click **Save** after setting the path. + +The Arm MCP Server provides six tools: + +| Tool | Description | +|------|-------------| +| `knowledge_base_search` | Semantic search of Arm learning resources, intrinsics documentation, and software compatibility | +| `migrate_ease_scan` | Code scanner for C++, Python, Go, JavaScript, and Java Arm compatibility analysis | +| `check_image` | Docker image architecture verification for Arm64 support | +| `skopeo` | Remote container image inspection without downloading | +| `mca` | Machine Code Analyzer for assembly performance and IPC predictions | +| `sysreport_instructions` | System architecture information gathering | + +### GitHub Official MCP Server + +Search for **GitHub Official** in the catalog and add the [GitHub MCP Server](https://hub.docker.com/mcp/server/github-official/overview). + +Configure authentication: + +- Select the GitHub Official server +- Choose **Personal Access Token** as the authentication method +- Enter your GitHub token from **GitHub Settings** > **Developer Settings** > **Personal access tokens** + +This server lets GitHub Copilot create pull requests, manage issues, and commit changes directly to your repositories. + +### Sequential Thinking MCP Server + +Search for **Sequential Thinking** in the catalog and add the [Sequential Thinking MCP Server](https://hub.docker.com/mcp/server/sequentialthinking/overview). + +No configuration is needed. This server helps GitHub Copilot break down complex migration decisions into logical steps. 
+ +## Connect VS Code to the MCP Gateway + +- In Docker Desktop, go to **MCP Toolkit** > **Clients** tab +- Scroll to **Visual Studio Code** and select **Connect** +- Open VS Code and select the **Extensions** icon in the left toolbar +- Find **MCP_DOCKER**, select the gear icon, and select **Start Server** + +## Verify the connection + +Open GitHub Copilot Chat in VS Code and ask: + +```text +What Arm migration tools do you have access to? +``` + +If the setup is correct, Copilot lists tools from: + +- Arm MCP Server +- GitHub MCP Server +- Sequential Thinking MCP Server + +This confirms that tool invocation through the MCP Gateway is working. + +## What you've learned and what's next + +You have: +- Enabled the Docker MCP Toolkit in Docker Desktop +- Configured three MCP servers: Arm MCP Server, GitHub MCP Server, and Sequential Thinking MCP Server +- Connected VS Code with GitHub Copilot to the MCP Gateway +- Verified that Copilot can access migration tools + +Next, you'll examine the demo application to identify x86-specific code that needs adaptation for Arm64. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/3-understand-the-demo.md b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/3-understand-the-demo.md new file mode 100644 index 0000000000..7e18d7c2f3 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/3-understand-the-demo.md @@ -0,0 +1,109 @@ +--- +title: Examine x86 AVX2 intrinsics in the demo application +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Clone the demo repository + +The demo application is a matrix multiplication benchmark written in C++ using AVX2 intrinsics for vectorized performance on x86 processors. 
+Clone the repository: +```bash +git clone https://github.com/JoeStech/docker-blog-arm-migration +cd docker-blog-arm-migration +``` +This example is intentionally optimized for x86 so that you can see how architecture-specific code appears in practice and how it can be adapted for Arm. + +## Examine the Dockerfile + +Open the `Dockerfile`. There are two areas that require updates for Arm compatibility. + +**Add Arm64 support in the base image**: The centos:6 image was published for x86 architecture and does not provide a `linux/arm64` variant. To run on Arm hardware, the base image must support Arm64. + +Modern multi-architecture base images typically publish both `linux/amd64` and `linux/arm64` manifests. Updating the base image is the first step toward portability. + +**Update compiler flags**: The `-mavx2` flag enables AVX2 vector instructions on x86. Arm processors use different SIMD instruction sets (NEON or SVE), so this flag must be removed or replaced when compiling for Arm. + +Here is the full Dockerfile for reference: +```dockerfile +FROM centos:6 + +RUN yum install -y \ + devtoolset-2-gcc \ + devtoolset-2-gcc-c++ \ + devtoolset-2-binutils \ + make \ + && yum clean all + +WORKDIR /app +COPY *.h *.cpp ./ + +RUN scl enable devtoolset-2 "g++ -O2 -mavx2 -o benchmark \ + main.cpp \ + matrix_operations.cpp \ + -std=c++11" + +CMD ["./benchmark"] +``` + +## Examine the source code +Open `matrix_operations.cpp`. At the top of the file: + +```cpp +#include <immintrin.h> // x86-only header +``` +The `<immintrin.h>` header provides Intel SIMD intrinsics, including AVX and AVX2. On Arm systems, SIMD intrinsics are provided through `<arm_neon.h>` instead. 
+ +Inside the matrix multiplication routine, you will see AVX2 intrinsics such as: +```cpp +// Inside the multiply function: +__m256d sum_vec = _mm256_setzero_pd(); +__m256d a_vec = _mm256_loadu_pd(&data[i][k]); +sum_vec = _mm256_add_pd(sum_vec, _mm256_mul_pd(a_vec, b_vec)); + +// Horizontal reduction +__m128d sum_high = _mm256_extractf128_pd(sum_vec, 1); +__m128d sum_low = _mm256_castpd256_pd128(sum_vec); +``` +These _mm256_* functions map directly to 256-bit AVX2 instructions. + +## Architecture considerations for Arm + +To run this code on Arm, several adjustments are required: + +1. **SIMD header replacement**: x86 uses `#include <immintrin.h>`. Arm uses `<arm_neon.h>` instead. + +2. **Intrinsic mapping**: Each AVX2 intrinsic must be mapped to an Arm equivalent. + For example: + - `_mm256_setzero_pd()` creates a 256-bit zero vector of four doubles. Arm NEON uses 128-bit registers. + - `_mm256_loadu_pd()` loads 4 doubles at once (NEON loads 2 with `vld1q_f64`). + - `_mm256_add_pd()` and `_mm256_mul_pd()` are 256-bit operations (NEON uses 128-bit equivalents). + - `_mm256_extractf128_pd()` extracts the high 128 bits (not needed on NEON). + +3. **Vector width differences**: AVX2 operates on 256-bit registers (four double-precision values). NEON operates on 128-bit registers (two double-precision values). This affects: + - Loop stride + - Accumulation logic + - Horizontal reduction patterns + +4. **Horizontal reduction logic**: The AVX2 pattern: + +```cpp + _mm256_extractf128_pd(...) + _mm256_castpd256_pd128(...) +``` +is specific to x86 register structure. On Arm, reduction is implemented using NEON reduction or pairwise-add instructions instead. + +{{% notice Note %}} +On newer Arm platforms supporting SVE or SVE2 (for example Neoverse V1/V2 based platforms), wider vector lengths may be available. SVE uses a vector-length-agnostic (VLA) model, which differs from fixed-width AVX2 and NEON programming. 
The Arm MCP Server knowledge base can help determine the appropriate approach for your target platform. +{{% /notice %}} + +## What you've learned and what's next + +You have: +- Examined a legacy x86 application with AVX2 intrinsics +- Identified the architecture-specific elements: base image, compiler flags, SIMD headers, and intrinsic functions +- Understood how vector width differences between AVX2 (256-bit) and NEON (128-bit) affect the migration approach + +Next, you'll use GitHub Copilot with the Docker MCP Toolkit to automate the migration process. diff --git a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/4-run-migration.md b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/4-run-migration.md new file mode 100644 index 0000000000..1c77d14169 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/4-run-migration.md @@ -0,0 +1,157 @@ +--- +title: Automate x86 to Arm migration with GitHub Copilot +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Open the project in VS Code + +Open the cloned `docker-blog-arm-migration` directory in VS Code: + +```bash +cd docker-blog-arm-migration +code . +``` + +Make sure the MCP_DOCKER server is running in VS Code (use **Extensions** > **MCP_DOCKER** > **Start Server** if needed). + +This allows GitHub Copilot to invoke the configured MCP servers through the MCP Gateway. + +## Provide migration instructions to GitHub Copilot + +Open GitHub Copilot Chat in VS Code and paste the following prompt: + +```text +Your goal is to migrate this codebase from x86 to Arm64. Use the Arm MCP +Server tools to help you with this migration. + +Steps to follow: +1. Check all Dockerfiles - use check_image and/or skopeo tools to verify + Arm compatibility, changing the base image if necessary +2. Scan the codebase - run migrate_ease_scan with the appropriate language + scanner and apply the suggested changes +3. 
Use knowledge_base_search when you need Arm architecture guidance or + intrinsic equivalents +4. Update compiler flags and dependencies for Arm64 compatibility +5. Create a pull request with all changes using GitHub MCP Server + +Important notes: +- Your current working directory is mapped to /workspace on the MCP server +- NEON lane indices must be compile-time constants, not variables +- If unsure about Arm equivalents, use knowledge_base_search to find docs +- Be sure to find out from the user or system what the target machine is, + and use the appropriate intrinsics. For instance, if neoverse (Graviton, + Axion, Cobalt) is targeted, use the latest SME/SME2. + +After completing the migration: +- Create a pull request with a detailed description of changes +- Include performance predictions and cost savings in the PR description +- List all tools used and validation steps needed +``` +This prompt instructs Copilot to use structured MCP tools rather than relying purely on generated suggestions. + +## Observe the migration workflow + +Copilot now orchestrates the migration using the configured MCP servers. The workflow typically proceeds in several phases. + +### Phase 1: Container Image analysis + +Copilot invokes `check_image` or `skopeo` from the Arm MCP Server: + +```text +Checking centos:6 for arm64 support... +``` + +The tool reports that `centos:6` has no `linux/arm64` build available. Copilot proposes replacing the base image with a modern multi-architecture alternative. +This step ensures the container can build and run on Arm hardware before addressing source-level changes. + +### Phase 2: Source Code scanning + +Copilot runs the `migrate_ease_scan` tool with the C++ scanner: + +```text +Running migrate_ease_scan with scanner: cpp +``` + +The scan detects: + +- AVX2 intrinsics (`_mm256_*` functions) in `matrix_operations.cpp`. +- The `-mavx2` compiler flag in the Dockerfile. +- The x86-specific header `<immintrin.h>`. 
+ +Each finding includes file locations and recommended actions. This structured scan avoids manually searching through the codebase. + +### Phase 3: Knowledge base lookup and refactoring code + +For each x86 intrinsic found, Copilot queries the Arm MCP Server knowledge base: + +```text +Searching knowledge base for: AVX2 to NEON intrinsic conversion +``` + +The Arm MCP knowledge base provides documented guidance on intrinsic mapping and architecture considerations. +Example mappings: + +| x86 AVX2 Intrinsic | Arm NEON Equivalent | +|---------------------|---------------------| +| `_mm256_setzero_pd()` | Two `vdupq_n_f64(0.0)` operations | +| `_mm256_loadu_pd()` | Two `vld1q_f64()` loads | +| `_mm256_add_pd()` | Two `vaddq_f64()` operations | +| `_mm256_mul_pd()` | Two `vmulq_f64()` operations | + +Because AVX2 operates on 256-bit vectors (four doubles) and NEON operates on 128-bit vectors (two doubles), Copilot adjusts: + - Loop stride + - Accumulation logic + - Horizontal reduction pattern + +The refactoring typically includes: + - Guarding architecture-specific code with #ifdef __aarch64__ + - Replacing `<immintrin.h>` with `<arm_neon.h>` where appropriate + - Updating compiler flags (for example replacing -mavx2) + - Selecting an Arm-compatible base image such as ubuntu:22.04 + - Supporting multi-architecture builds using TARGETARCH + +All proposed changes should be reviewed before merging. + +### Phase 4: Pull request creation + +Once modifications are complete, Copilot invokes the GitHub MCP Server to: + - Create a branch + - Commit changes + - Open a pull request + +The PR typically includes: + - Updated Dockerfile + - Refactored source files + - A description of the changes + - A summary of MCP tools used + - Suggested validation steps for Arm hardware + +You can see an example PR at [github.com/JoeStech/docker-blog-arm-migration/pull/1](https://github.com/JoeStech/docker-blog-arm-migration/pull/1). 
+ +## Summary of changes + +After migration, you should see: + +**Dockerfile updates**: +- Replaced `centos:6` with `ubuntu:22.04` +- Added `TARGETARCH` for multi-architecture builds +- Changed `-mavx2` to `-march=armv8-a+simd` for Arm builds + +**Source code updates**: +- Added `#ifdef __aarch64__` architecture guards +- Replaced all `_mm256_*` AVX2 intrinsics with NEON equivalents (`vld1q_f64`, `vaddq_f64`, `vmulq_f64`) +- Adjusted loop strides from 4 (AVX2) to 2 (NEON) +- Rewrote horizontal reduction using NEON pair-wise addition + +## What you've learned and what's next + +You have: +- Provided migration instructions to GitHub Copilot +- Observed the AI-driven workflow using MCP server tools +- Reviewed the automated changes: container image updates, intrinsic mapping, and compiler flag adjustments +- Seen how the GitHub MCP Server creates a pull request with all migration changes + +Next, you'll build and validate the migrated application on Arm64 hardware. diff --git a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/5-validate-and-next-steps.md b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/5-validate-and-next-steps.md new file mode 100644 index 0000000000..52dbb54c99 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/5-validate-and-next-steps.md @@ -0,0 +1,130 @@ +--- +title: Validate the Arm64 migration and test containers +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Build and test on Arm + +After reviewing and merging the pull request, build the migrated benchmark for Arm64: + +```bash +docker buildx build --platform linux/arm64 -t benchmark:arm64 . --load +``` +This command builds the image using the Arm64 target platform and loads it into your local Docker image cache. 
+ +Run the benchmark: + +```bash +docker run --rm benchmark:arm64 +``` + +Expected output: + +```output +SIMD Matrix Operations Benchmark +================================ +Running on Arm64 architecture with NEON optimizations +=== Matrix Multiplication Benchmark === +Matrix size: 200x200 +Time: 17 ms +Result sum: 1.98888e+08 +``` +Your timing results may vary depending on the underlying hardware. + +## Verify the image architecture + +Confirm the image was built for Arm: + +```bash +docker inspect benchmark:arm64 | grep Architecture +``` + +Expected output: + +```output +"Architecture": "arm64", +``` +This verifies that the container is built for the correct target architecture. + +## Build a multi-architecture image + +To support both x86 and Arm from the same Dockerfile, use `docker buildx`: + +```bash +docker buildx create --name multiarch --use +docker buildx build \ + --platform linux/amd64,linux/arm64 \ + --tag your-registry/benchmark:latest \ + --push . +``` +This produces a multi-architecture manifest that allows Docker to automatically pull the correct image for the host platform. + +## Comparing approaches + +AI-assisted workflows streamline repetitive discovery and mapping tasks, particularly when architecture-specific intrinsics are involved. + +| Approach | Effort | +|----------|--------| +| Manual migration (install tools, research intrinsics, rewrite code, debug, document) | Several hours to days, depending on complexity | +| Docker MCP Toolkit + GitHub Copilot (prompt, review, merge) | Reduced to minutes for initial migration, plus review time | + +Actual time savings depend on codebase size and complexity, but structured tool invocation reduces the need for manual documentation lookup and repetitive edits. + +## Add CI/CD architecture validation + +To prevent regressions, add architecture validation to your CI pipeline. 
+Example GitHub Actions workflow: + +```yaml +name: Validate Arm64 Support +on: [push, pull_request] + +jobs: + check-arm64: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build for arm64 + run: | + docker buildx build \ + --platform linux/arm64 \ + -t benchmark:arm64-test . +``` +This ensures future changes remain compatible with Arm64 builds. + +## Validation considerations + +Not all AI models produce equal results for migration tasks. While the Arm MCP Server provides structured migration context, AI-generated code should always be reviewed and validated. + +- Always use a current foundational model for best results. +- Test any performance predictions the model makes against actual benchmarks. +- Review the generated NEON code for correctness, especially horizontal reductions and lane indexing. +- NEON lane indices must be compile-time constants, not variables. + +## Explore further + +The Docker MCP Toolkit and Arm MCP Server support more than the example migration shown here: + +- **Multiple languages**: The `migrate_ease_scan` tool supports C++, Python, Go, JavaScript, and Java. +- **Performance analysis**: The `mca` (Machine Code Analyzer) tool predicts IPC and execution time on different CPU architectures. +- **Knowledge base**: The `knowledge_base_search` tool covers all content from [learn.arm.com](https://learn.arm.com) Learning Paths, intrinsics documentation, and software compatibility information. +- **Dynamic MCP**: AI agents can discover and add new MCP servers from the Docker MCP Catalog during a conversation without manual configuration. 
+ +## What you've learned + +In this Learning Path, you: + +- Installed and configured the Docker MCP Toolkit with the Arm MCP Server, GitHub MCP Server, and Sequential Thinking MCP Server +- Connected VS Code with GitHub Copilot to the MCP Gateway +- Examined architecture-specific elements in a legacy x86 AVX2 application +- Used AI-assisted MCP tools to analyze, refactor, and update the codebase for Arm64 +- Built and validated the migrated application on Arm64 + +The Docker MCP Toolkit enables AI assistants to invoke structured migration tools inside the containerized Arm MCP server. This approach reduces manual lookup and repetitive refactoring work while keeping developers in control of review and validation. diff --git a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/_index.md b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/_index.md new file mode 100644 index 0000000000..9208107622 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/_index.md @@ -0,0 +1,69 @@ +--- +title: Automate x86 to Arm Migration with Docker MCP Toolkit, VS Code and GitHub Copilot + +description: Learn how to use the Docker MCP Toolkit with the Arm MCP Server and GitHub Copilot to automate container and code migration from x86 to Arm64. Through a hands-on example, migrate a legacy C++ application with AVX2 intrinsics to Arm NEON. + +minutes_to_complete: 45 + +who_is_this_for: This is an advanced topic for developers and DevOps engineers who want to automate the migration of containerized applications from x86 to Arm64 using AI-powered tools in the Docker MCP Toolkit. 
+ +learning_objectives: + - Describe how the Model Context Protocol (MCP) enables AI coding assistants to invoke structured migration tools through the Arm MCP server + - Explain how the Docker MCP Toolkit connects AI coding assistants to Arm MCP server + - Install and configure the Docker MCP Toolkit with the Arm MCP Server, GitHub MCP Server, and Sequential Thinking MCP Server + - Connect the MCP Gateway to VS Code with GitHub Copilot + - Use AI agents to scan codebases for x86-specific dependencies and intrinsics + - Automate the conversion of x86 AVX2 intrinsics to Arm NEON equivalents using the Arm MCP Server knowledge base + - Create and manage pull requests with migrated code using the GitHub MCP Server + +prerequisites: + - Docker Desktop 4.59 or later with MCP Toolkit enabled + - VS Code with the GitHub Copilot extension + - A GitHub account with a personal access token + - A machine with at least 8 GB RAM (16 GB recommended) + - Basic familiarity with Docker, C++, and SIMD intrinsics concepts +author: Ajeet Singh Raina + +### Tags +skilllevels: Advanced +subjects: Containers and Virtualization +armips: + - Neoverse +tools_software_languages: + - Docker + - MCP + - GitHub Copilot + - C++ + - VS Code +operatingsystems: + - Linux + - macOS + +further_reading: + - resource: + title: Docker MCP Toolkit Documentation + link: https://docs.docker.com/ai/mcp-catalog-and-toolkit/toolkit/ + type: documentation + - resource: + title: Arm MCP Server on Docker Hub + link: https://hub.docker.com/mcp/server/arm-mcp/overview + type: website + - resource: + title: Docker MCP Gateway on GitHub + link: https://github.com/docker/mcp-gateway + type: website + - resource: + title: Introducing the Arm MCP Server + link: https://developer.arm.com/community/arm-community-blogs/b/ai-blog/posts/introducing-the-arm-mcp-server-simplifying-cloud-migration-with-ai + type: blog + - resource: + title: Arm MCP Server Learning Path + link: 
/learning-paths/servers-and-cloud-computing/arm-mcp-server/ + type: learning-path + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/_next-steps.md new file mode 100644 index 0000000000..e20dfa6d43 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 +title: "Next Steps" +layout: "learningpathall" +--- diff --git a/content/learning-paths/servers-and-cloud-computing/dotnet-migration/_index.md b/content/learning-paths/servers-and-cloud-computing/dotnet-migration/_index.md index 8da5c61ffd..ef1a18f89b 100644 --- a/content/learning-paths/servers-and-cloud-computing/dotnet-migration/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/dotnet-migration/_index.md @@ -24,6 +24,8 @@ author: Joe Stech ### Tags skilllevels: Advanced subjects: Performance and Architecture +cloud_service_providers: + - Microsoft Azure armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/envoy/_index.md b/content/learning-paths/servers-and-cloud-computing/envoy/_index.md index 8ae8d1d733..b4bd7888fb 100644 --- a/content/learning-paths/servers-and-cloud-computing/envoy/_index.md +++ 
b/content/learning-paths/servers-and-cloud-computing/envoy/_index.md @@ -19,6 +19,11 @@ author: Zhengjun Xing ### Tags skilllevels: Introductory subjects: Web +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/envoy_tune/_index.md b/content/learning-paths/servers-and-cloud-computing/envoy_tune/_index.md index acb8172d60..fedd30c25b 100644 --- a/content/learning-paths/servers-and-cloud-computing/envoy_tune/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/envoy_tune/_index.md @@ -20,6 +20,11 @@ author: Zhengjun Xing ### Tags skilllevels: Advanced subjects: Web +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/_index.md b/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/_index.md index 7e3bcbd4b9..1c2a546572 100644 --- a/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/false-sharing-arm-spe/_index.md @@ -20,6 +20,11 @@ author: Kieran Hejmadi ### Tags skilllevels: Introductory subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/fexpa/_index.md b/content/learning-paths/servers-and-cloud-computing/fexpa/_index.md index 2103975ffd..8ec011d418 100644 --- a/content/learning-paths/servers-and-cloud-computing/fexpa/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/fexpa/_index.md @@ -36,6 +36,10 @@ further_reading: ### Tags skilllevels: Introductory subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud 
armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/flink/_index.md b/content/learning-paths/servers-and-cloud-computing/flink/_index.md index 60fb075c45..a19ae0eebb 100644 --- a/content/learning-paths/servers-and-cloud-computing/flink/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/flink/_index.md @@ -20,6 +20,9 @@ skilllevels: Introductory subjects: Databases cloud_service_providers: - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse diff --git a/content/learning-paths/servers-and-cloud-computing/funasr/_index.md b/content/learning-paths/servers-and-cloud-computing/funasr/_index.md index ee20ec87bb..61f6639c02 100644 --- a/content/learning-paths/servers-and-cloud-computing/funasr/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/funasr/_index.md @@ -18,6 +18,11 @@ author: Odin Shen ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/_index.md b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/_index.md index 9daf5ab6e3..9070845d10 100644 --- a/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/gke-multi-arch-axion/_index.md @@ -23,6 +23,8 @@ author: ### Tags skilllevels: Advanced subjects: Containers and Virtualization +cloud_service_providers: + - Google Cloud armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/go-benchmarking-with-sweet/_index.md b/content/learning-paths/servers-and-cloud-computing/go-benchmarking-with-sweet/_index.md index 7ee9536e84..1351109dba 100644 --- a/content/learning-paths/servers-and-cloud-computing/go-benchmarking-with-sweet/_index.md +++ 
b/content/learning-paths/servers-and-cloud-computing/go-benchmarking-with-sweet/_index.md @@ -19,10 +19,10 @@ author: Geremy Cohen ### Tags skilllevels: Introductory subjects: Performance and Architecture -armips: - - Neoverse cloud_service_providers: - Google Cloud +armips: + - Neoverse tools_software_languages: - Go operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/_index.md b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/_index.md index 1aedbc7ca3..5732a4f4d9 100644 --- a/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/_index.md @@ -20,6 +20,11 @@ author: Kieran Hejmadi ### Tags skilllevels: Introductory subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/java-on-axion/_index.md b/content/learning-paths/servers-and-cloud-computing/java-on-axion/_index.md index 098cf6fc3a..0e93c8adba 100644 --- a/content/learning-paths/servers-and-cloud-computing/java-on-axion/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/java-on-axion/_index.md @@ -18,9 +18,9 @@ author: Joe Stech ### Tags skilllevels: Introductory +subjects: Performance and Architecture cloud_service_providers: - Google Cloud -subjects: Performance and Architecture armips: - Neoverse V2 tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/jenkins/_index.md b/content/learning-paths/servers-and-cloud-computing/jenkins/_index.md index 53581e9caa..a811a815e6 100644 --- a/content/learning-paths/servers-and-cloud-computing/jenkins/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/jenkins/_index.md @@ -24,6 +24,9 @@ author: Pareena Verma ##### Tags skilllevels: Advanced subjects: CI-CD 
+cloud_service_providers: + - Microsoft Azure + - Google Cloud armips: - Neoverse diff --git a/content/learning-paths/servers-and-cloud-computing/kafka/_index.md b/content/learning-paths/servers-and-cloud-computing/kafka/_index.md index 34f64a3ffb..89df559300 100644 --- a/content/learning-paths/servers-and-cloud-computing/kafka/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/kafka/_index.md @@ -20,6 +20,9 @@ author: Pareena Verma ### Tags skilllevels: Advanced subjects: Storage +cloud_service_providers: + - AWS + - Google Cloud armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/kedify-http-autoscaling/_index.md b/content/learning-paths/servers-and-cloud-computing/kedify-http-autoscaling/_index.md index 6f62cd94af..8c75b7b429 100644 --- a/content/learning-paths/servers-and-cloud-computing/kedify-http-autoscaling/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/kedify-http-autoscaling/_index.md @@ -20,6 +20,10 @@ author: Zbynek Roubalik ### Tags skilllevels: Introductory subjects: Containers and Virtualization +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/keras-core/_index.md b/content/learning-paths/servers-and-cloud-computing/keras-core/_index.md index f358403b32..ee6b70542a 100644 --- a/content/learning-paths/servers-and-cloud-computing/keras-core/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/keras-core/_index.md @@ -22,6 +22,11 @@ author: ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/kernel-build/_index.md b/content/learning-paths/servers-and-cloud-computing/kernel-build/_index.md index 28fbcc6930..308929d1dc 100644 --- 
a/content/learning-paths/servers-and-cloud-computing/kernel-build/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/kernel-build/_index.md @@ -21,6 +21,11 @@ author: Geremy Cohen ### Tags skilllevels: Advanced subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/kubearchinspect/_index.md b/content/learning-paths/servers-and-cloud-computing/kubearchinspect/_index.md index af97db7fd3..358d17adaa 100644 --- a/content/learning-paths/servers-and-cloud-computing/kubearchinspect/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/kubearchinspect/_index.md @@ -19,6 +19,11 @@ author: Jason Andrews ### Tags skilllevels: Introductory subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/libhugetlbfs/_index.md b/content/learning-paths/servers-and-cloud-computing/libhugetlbfs/_index.md index ea87140d77..9db28b9625 100644 --- a/content/learning-paths/servers-and-cloud-computing/libhugetlbfs/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/libhugetlbfs/_index.md @@ -17,6 +17,11 @@ author: Bolt Liu skilllevels: Advanced subjects: Databases +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md b/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md index 7fd7bcafe9..017889b601 100644 --- a/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md @@ -22,6 +22,8 @@ author: ### Tags skilllevels: Introductory subjects: ML 
+cloud_service_providers: + - AWS armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/llama-vision/_index.md b/content/learning-paths/servers-and-cloud-computing/llama-vision/_index.md index 5b7743bff5..3b83fd9718 100644 --- a/content/learning-paths/servers-and-cloud-computing/llama-vision/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/llama-vision/_index.md @@ -24,6 +24,8 @@ skilllevels: Advanced armips: - Neoverse subjects: ML +cloud_service_providers: + - Google Cloud operatingsystems: - Linux tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/lse/_index.md b/content/learning-paths/servers-and-cloud-computing/lse/_index.md index be8bd0339d..28e9795cec 100644 --- a/content/learning-paths/servers-and-cloud-computing/lse/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/lse/_index.md @@ -17,6 +17,11 @@ author: Jason Andrews ### Tags skilllevels: Introductory subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/mariadb/_index.md b/content/learning-paths/servers-and-cloud-computing/mariadb/_index.md index 060fe483b0..5db386e5ae 100644 --- a/content/learning-paths/servers-and-cloud-computing/mariadb/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/mariadb/_index.md @@ -19,6 +19,10 @@ author: Jason Andrews ### Tags skilllevels: Introductory subjects: Databases +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/memcached/_index.md b/content/learning-paths/servers-and-cloud-computing/memcached/_index.md index bcc32283cf..dd9f45df31 100644 --- a/content/learning-paths/servers-and-cloud-computing/memcached/_index.md +++ 
b/content/learning-paths/servers-and-cloud-computing/memcached/_index.md @@ -23,6 +23,9 @@ test_maintenance: true ### Tags skilllevels: Introductory subjects: Web +cloud_service_providers: + - AWS + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/memcached_cache/_index.md b/content/learning-paths/servers-and-cloud-computing/memcached_cache/_index.md index 562fdee0f2..b8385753f5 100644 --- a/content/learning-paths/servers-and-cloud-computing/memcached_cache/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/memcached_cache/_index.md @@ -29,6 +29,10 @@ test_maintenance: true ### Tags skilllevels: Advanced subjects: Web +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/microbenchmark-network-iperf3/_index.md b/content/learning-paths/servers-and-cloud-computing/microbenchmark-network-iperf3/_index.md index ef145877a8..078aa543ed 100644 --- a/content/learning-paths/servers-and-cloud-computing/microbenchmark-network-iperf3/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/microbenchmark-network-iperf3/_index.md @@ -19,6 +19,11 @@ author: Kieran Hejmadi ### Tags skilllevels: Introductory subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/migration/_index.md b/content/learning-paths/servers-and-cloud-computing/migration/_index.md index 9361277b38..9aec54e89a 100644 --- a/content/learning-paths/servers-and-cloud-computing/migration/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/migration/_index.md @@ -19,6 +19,11 @@ author: Jason Andrews ### Tags skilllevels: Introductory subjects: Libraries +cloud_service_providers: + - AWS + - Microsoft Azure + - Google 
Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/milvus-rag/_index.md b/content/learning-paths/servers-and-cloud-computing/milvus-rag/_index.md index 64476da84c..8a010f317d 100644 --- a/content/learning-paths/servers-and-cloud-computing/milvus-rag/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/milvus-rag/_index.md @@ -19,6 +19,11 @@ author: Chen Zhang ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/ml-perf/_index.md b/content/learning-paths/servers-and-cloud-computing/ml-perf/_index.md index 3b26f12797..7ba08313dc 100644 --- a/content/learning-paths/servers-and-cloud-computing/ml-perf/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/ml-perf/_index.md @@ -26,6 +26,9 @@ test_maintenance: true ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/mongodb/_index.md b/content/learning-paths/servers-and-cloud-computing/mongodb/_index.md index 91b7774aad..da8c3638a1 100644 --- a/content/learning-paths/servers-and-cloud-computing/mongodb/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/mongodb/_index.md @@ -23,6 +23,11 @@ layout: learningpathall learning_path_main_page: 'yes' skilllevels: Introductory subjects: Databases +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle test_maintenance: false tools_software_languages: - MongoDB diff --git a/content/learning-paths/servers-and-cloud-computing/mpi/_index.md b/content/learning-paths/servers-and-cloud-computing/mpi/_index.md index cebe99bdb6..7e5f541b53 100644 --- a/content/learning-paths/servers-and-cloud-computing/mpi/_index.md +++ 
b/content/learning-paths/servers-and-cloud-computing/mpi/_index.md @@ -20,6 +20,11 @@ author: Florent Lebeau ### Tags skilllevels: Advanced subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/mysql/_index.md b/content/learning-paths/servers-and-cloud-computing/mysql/_index.md index e09a969610..235b79b0ba 100644 --- a/content/learning-paths/servers-and-cloud-computing/mysql/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/mysql/_index.md @@ -18,7 +18,10 @@ author: Jason Andrews skilllevels: Introductory subjects: Databases cloud_service_providers: + - AWS + - Microsoft Azure - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/mysql_benchmark/_index.md b/content/learning-paths/servers-and-cloud-computing/mysql_benchmark/_index.md index d8e6e81cdd..51e2c28332 100644 --- a/content/learning-paths/servers-and-cloud-computing/mysql_benchmark/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/mysql_benchmark/_index.md @@ -17,6 +17,11 @@ author: Bolt Liu skilllevels: Introductory subjects: Databases +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/mysql_tune/_index.md b/content/learning-paths/servers-and-cloud-computing/mysql_tune/_index.md index 08b04026d5..4c66347fb5 100644 --- a/content/learning-paths/servers-and-cloud-computing/mysql_tune/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/mysql_tune/_index.md @@ -16,7 +16,10 @@ author: Julio Suarez skilllevels: Advanced subjects: Databases cloud_service_providers: - - Google Cloud + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git 
a/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/_index.md b/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/_index.md index b2974b3c3b..833dc52cc3 100644 --- a/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/neoverse-rdv3-swstack/_index.md @@ -26,6 +26,11 @@ author: ### Tags skilllevels: Advanced subjects: Containers and Virtualization +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/net-aspire/_index.md b/content/learning-paths/servers-and-cloud-computing/net-aspire/_index.md index 87b5b8d6aa..96cd0d3c95 100644 --- a/content/learning-paths/servers-and-cloud-computing/net-aspire/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/net-aspire/_index.md @@ -21,6 +21,7 @@ author: Dawid Borycki skilllevels: Introductory subjects: Containers and Virtualization cloud_service_providers: + - AWS - Google Cloud armips: diff --git a/content/learning-paths/servers-and-cloud-computing/nginx/_index.md b/content/learning-paths/servers-and-cloud-computing/nginx/_index.md index 3b8c368c30..29f14ce283 100644 --- a/content/learning-paths/servers-and-cloud-computing/nginx/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/nginx/_index.md @@ -20,6 +20,11 @@ author: Julio Suarez ### Tags skilllevels: Introductory subjects: Web +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/nginx_tune/_index.md b/content/learning-paths/servers-and-cloud-computing/nginx_tune/_index.md index 18b93a4195..97032c8bf6 100644 --- a/content/learning-paths/servers-and-cloud-computing/nginx_tune/_index.md +++ 
b/content/learning-paths/servers-and-cloud-computing/nginx_tune/_index.md @@ -21,6 +21,11 @@ author: Julio Suarez ### Tags skilllevels: Advanced subjects: Web +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/nlp-hugging-face/_index.md b/content/learning-paths/servers-and-cloud-computing/nlp-hugging-face/_index.md index efddeacee2..c7be9fb269 100644 --- a/content/learning-paths/servers-and-cloud-computing/nlp-hugging-face/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/nlp-hugging-face/_index.md @@ -17,6 +17,11 @@ author: Pareena Verma ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/onnx/_index.md b/content/learning-paths/servers-and-cloud-computing/onnx/_index.md index 8cf67f15c0..db04a08937 100644 --- a/content/learning-paths/servers-and-cloud-computing/onnx/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/onnx/_index.md @@ -20,11 +20,11 @@ author: Nobel Chowdary Mandepudi ### Tags skilllevels: Advanced -cloud_service_providers: - - Microsoft Azure armips: - Neoverse subjects: ML +cloud_service_providers: + - Microsoft Azure operatingsystems: - Linux tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/openshift/_index.md b/content/learning-paths/servers-and-cloud-computing/openshift/_index.md index 876477f9c8..45e9301f9a 100644 --- a/content/learning-paths/servers-and-cloud-computing/openshift/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/openshift/_index.md @@ -19,6 +19,8 @@ author: Jeff Young # Tags skilllevels: Advanced subjects: CI-CD +cloud_service_providers: + - AWS armips: - Neoverse tools_software_languages: diff --git 
a/content/learning-paths/servers-and-cloud-computing/opentelemetry/_index.md b/content/learning-paths/servers-and-cloud-computing/opentelemetry/_index.md index 8263d0e7d6..20b563b6d4 100644 --- a/content/learning-paths/servers-and-cloud-computing/opentelemetry/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/opentelemetry/_index.md @@ -21,8 +21,8 @@ author: Pareena Verma ##### Tags skilllevels: Introductory subjects: Performance and Architecture -cloud_service_providers: -- Google Cloud +cloud_service_providers: + - Google Cloud armips: - Neoverse @@ -70,4 +70,3 @@ weight: 1 layout: "learningpathall" learning_path_main_page: yes --- - diff --git a/content/learning-paths/servers-and-cloud-computing/pac/_index.md b/content/learning-paths/servers-and-cloud-computing/pac/_index.md index 97aee0e01c..f635ab6a56 100644 --- a/content/learning-paths/servers-and-cloud-computing/pac/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/pac/_index.md @@ -20,6 +20,11 @@ author: Pareena Verma ### Tags skilllevels: Advanced subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/pinning-threads/_index.md b/content/learning-paths/servers-and-cloud-computing/pinning-threads/_index.md index f3357c1b3c..9723089cad 100644 --- a/content/learning-paths/servers-and-cloud-computing/pinning-threads/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/pinning-threads/_index.md @@ -22,6 +22,11 @@ author: Kieran Hejmadi ### Tags skilllevels: Advanced subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/postgresql/_index.md b/content/learning-paths/servers-and-cloud-computing/postgresql/_index.md index 
8984876121..3937b08cb4 100644 --- a/content/learning-paths/servers-and-cloud-computing/postgresql/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/postgresql/_index.md @@ -17,6 +17,11 @@ author: Jason Andrews ### Tags skilllevels: Introductory subjects: Databases +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/postgresql_tune/_index.md b/content/learning-paths/servers-and-cloud-computing/postgresql_tune/_index.md index 9adc6b6491..e86f03a5ea 100644 --- a/content/learning-paths/servers-and-cloud-computing/postgresql_tune/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/postgresql_tune/_index.md @@ -20,6 +20,11 @@ test_maintenance: true ### Tags skilllevels: Advanced subjects: Databases +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/pytorch-llama/_index.md b/content/learning-paths/servers-and-cloud-computing/pytorch-llama/_index.md index 364004c9e8..d61d0e6efc 100644 --- a/content/learning-paths/servers-and-cloud-computing/pytorch-llama/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/pytorch-llama/_index.md @@ -23,6 +23,11 @@ author: ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/rabbitmq-gcp/_index.md b/content/learning-paths/servers-and-cloud-computing/rabbitmq-gcp/_index.md index 1fdd1f5e87..e413aa9de4 100644 --- a/content/learning-paths/servers-and-cloud-computing/rabbitmq-gcp/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/rabbitmq-gcp/_index.md @@ -24,6 +24,9 @@ author: Pareena Verma ##### Tags skilllevels: Introductory subjects: Databases 
+cloud_service_providers: + - Microsoft Azure + - Google Cloud armips: - Neoverse diff --git a/content/learning-paths/servers-and-cloud-computing/rag/_index.md b/content/learning-paths/servers-and-cloud-computing/rag/_index.md index 32e28a80da..530fe9e21b 100644 --- a/content/learning-paths/servers-and-cloud-computing/rag/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/rag/_index.md @@ -23,11 +23,11 @@ author: Nobel Chowdary Mandepudi ### Tags skilllevels: Advanced -cloud_service_providers: - - Google Cloud armips: - Neoverse subjects: ML +cloud_service_providers: + - Google Cloud operatingsystems: - Linux tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/redis/_index.md b/content/learning-paths/servers-and-cloud-computing/redis/_index.md index 52df6ae072..21d2ccff3d 100644 --- a/content/learning-paths/servers-and-cloud-computing/redis/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/redis/_index.md @@ -19,7 +19,10 @@ author: Elham Harirpoush skilllevels: Introductory subjects: Databases cloud_service_providers: + - AWS + - Microsoft Azure - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/redis_cache/_index.md b/content/learning-paths/servers-and-cloud-computing/redis_cache/_index.md index 0c390a043b..4168eebc40 100644 --- a/content/learning-paths/servers-and-cloud-computing/redis_cache/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/redis_cache/_index.md @@ -19,6 +19,10 @@ author: Jason Andrews ### Tags skilllevels: Advanced subjects: Databases +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/redis_tune/_index.md b/content/learning-paths/servers-and-cloud-computing/redis_tune/_index.md index 9e7e41c663..89c619eec2 100644 --- 
a/content/learning-paths/servers-and-cloud-computing/redis_tune/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/redis_tune/_index.md @@ -20,7 +20,10 @@ author: Elham Harirpoush skilllevels: Advanced subjects: Databases cloud_service_providers: + - AWS + - Microsoft Azure - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/_index.md b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/_index.md new file mode 100644 index 0000000000..574aba45f0 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/_index.md @@ -0,0 +1,52 @@ +--- +title: Use reproducible functions in Libamath (Arm Performance Libraries) + +draft: true +cascade: + draft: true + +minutes_to_complete: 10 +author: Joana Cruz + +who_is_this_for: This is an introductory topic for developers who want to produce reproducible code across vector extensions using math functions in Libamath, a component of Arm Performance Libraries. + +learning_objectives: + - Explain what numerical reproducibility means in numerical software + - Describe generic applications of numerical reproducibility in the industry + - Understand how reproducibility is defined in Libamath + - Enable and use reproducible Libamath functions in real applications + +prerequisites: + - An Arm computer running Linux with [Arm Performance Libraries](/install-guides/armpl/) version 26.01 or newer installed. 
+ +### Tags +skilllevels: Introductory +subjects: Performance and Architecture +armips: + - Neoverse + - SVE +tools_software_languages: + - Arm Performance Libraries + - GCC + - LLVM + - Libamath +operatingsystems: + - Linux + +further_reading: + - resource: + title: ArmPL Libamath Documentation + link: https://developer.arm.com/documentation/101004/2601/Arm-Performance-Libraries-Math-Functions + type: documentation + - resource: + title: ArmPL Installation Guide on Linux + link: /install-guides/armpl/#linux + type: website + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/_next-steps.md new file mode 100644 index 0000000000..727b395ddd --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # The weight controls the order of the pages. _index.md always has weight 1. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/applications.md b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/applications.md new file mode 100644 index 0000000000..741f10aa06 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/applications.md @@ -0,0 +1,56 @@ +--- +title: Applications of Reproducibility +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +# Applications of Reproducibility + + +Reproducibility is not required for every application, but it is critical in several important domains. Here are some examples. + +### Auto-vectorisation + +Modern compilers automatically vectorise scalar loops when possible. This means that, depending on the compiler decisions, the same source code may be executed as a scalar loop, as a Neon vectorized loop or as an SVE vectorised loop. + +Additionally, vectorised loops often include scalar tail handling for leftover elements that do not fill an entire vector. + +Reproducibility across math routines guarantees that: + +* Vectorized loops (Neon or SVE) match regardless of which one is used + +* The result of loops over scalar routines matches the results of vectorised loops (Neon or SVE) + +* Changing vector width or enabling/disabling auto-vectorisation does not change the final output + + +### Distributed Computing + +In distributed or parallel workloads, computations are often decomposed across multiple machines or execution units. + +* Different nodes may execute scalar, Neon, or SVE code paths + +* The decomposition of work can change between runs + +* Without reproducible math routines, small numerical differences can accumulate and lead to divergent final results + +### Embedded and real-time systems + +In real-time environments, determinism is essential. 
+ +* Bitwise-identical results simplify validation + +* Reproducibility ensures consistent behavior across software updates and hardware variants + +* Debugging and fault analysis become significantly easier + +### Gaming and simulation +Many games and simulations rely on deterministic numerical behavior. + +* Reproducibility enables lockstep simulations across threads or devices + +* It helps prevent desynchronization in multiplayer or replay systems + +* Deterministic math simplifies testing and debugging of complex numerical code diff --git a/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/examples.md b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/examples.md new file mode 100644 index 0000000000..b821546204 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/examples.md @@ -0,0 +1,182 @@ +--- +title: Examples +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Example: Reproducible expf + +In this example, you will take a look into reproducible Libamath usage, considering the case of a simple computation using the exponential function in single precision (`expf`). + + +#### Setting up your environment +In this example we use the **`GCC-14`** compiler on a **`Neoverse V1`** machine. We use the [ArmPL 26.01 module](/install-guides/armpl/). 
+You can set up some environment variables to make compilation commands simpler: + +```bash { command_line="root@localhost" } +export LD_LIBRARY_PATH=/lib:$LD_LIBRARY_PATH +export C_INCLUDE_PATH=/include +export LIBRARY_PATH=/lib +``` + +With this setup, you can compile the examples to use the reproducible Libamath library via: +```bash { command_line="root@localhost" } +$CC app.c -DAMATH_REPRO=1 -lamath_repro -o app +``` + +If in turn you are interested in using the non-reproducible Libamath library, you should compile with: +```bash { command_line="root@localhost" } +$CC app.c -lamath -o app +``` + +Note that this only works if `app.c` only contains functions that are present in both versions of the library (`libamath_repro.a` contains a subset of functions in `libamath.a`). + +You can run examples via: + +```bash { command_line="root@localhost" } +./app +``` + +For `SVE` applications, add `-march=armv8-a+sve` to the compilation command. For example: + +```bash { command_line="root@localhost" } +$CC app.c -DAMATH_REPRO=1 -lamath_repro -march=armv8-a+sve -o app +``` + + +#### Scalar usage + +Our starting point is a small application that uses the scalar implementation of the single precision exponential function `armpl_exp_f32`. +Below you can find the example code, the output when reproducibility is enabled versus when reproducibility is disabled. 
+ +{{< tabpane code=true >}} + + {{< tab header="C Application" language="C" output_lines="10">}} +#include +#include + +int main(void) +{ + float y = armpl_exp_f32(0x1.ebe93cp-1f); // 0.960763812065125 + printf("y = %.15f [%a]\n", y, y); + return 0; +} + {{< /tab >}} + + {{< tab header="Output (repro enabled)" language="bash">}} +y = 2.613692283630371 [0x1.4e8d78p+1] + {{< /tab >}} + + {{< tab header="Output (repro disabled)" language="bash">}} +y = 2.613692045211792 [0x1.4e8d76p+1] + {{< /tab >}} + +{{< /tabpane >}} + + +#### Neon usage + +Now we build a simple Neon application that invokes the reproducible Neon implementation of the single precision exponential function `armpl_vexpq_f32`. + +{{< tabpane code=true >}} + {{< tab header="C" language="C" output_lines="15">}} +#include +#include +#include + +int main(void) +{ + float32x4_t x = vdupq_n_f32(0x1.ebe93cp-1f); // 0.960763812065125 + float32x4_t y = armpl_vexpq_f32(x); + + printf("y (lane 0) = %.15f [%a]\n", vgetq_lane_f32(y, 0), vgetq_lane_f32(y, 0)); + printf("y (lane 1) = %.15f [%a]\n", vgetq_lane_f32(y, 1), vgetq_lane_f32(y, 1)); + printf("y (lane 2) = %.15f [%a]\n", vgetq_lane_f32(y, 2), vgetq_lane_f32(y, 2)); + printf("y (lane 3) = %.15f [%a]\n", vgetq_lane_f32(y, 3), vgetq_lane_f32(y, 3)); + return 0; +} + + + {{< /tab >}} + + {{< tab header="Output (repro enabled)" language="bash">}} +y (lane 0) = 2.613692283630371 [0x1.4e8d78p+1] +y (lane 1) = 2.613692283630371 [0x1.4e8d78p+1] +y (lane 2) = 2.613692283630371 [0x1.4e8d78p+1] +y (lane 3) = 2.613692283630371 [0x1.4e8d78p+1] + {{< /tab >}} + + + {{< tab header="Output (repro disabled)" language="bash">}} +y (lane 0) = 2.613692283630371 [0x1.4e8d78p+1] +y (lane 1) = 2.613692283630371 [0x1.4e8d78p+1] +y (lane 2) = 2.613692283630371 [0x1.4e8d78p+1] +y (lane 3) = 2.613692283630371 [0x1.4e8d78p+1] + {{< /tab >}} +{{< /tabpane >}} + + +Once we run this example, each lane of `y` will contain the same bit pattern as the scalar result of `armpl_exp_f32(0x1.ebe93cp-1f)` 
(as you can see in the *Output* tab). + +#### SVE usage +Finally we build a simple SVE application that invokes the reproducible SVE implementation of the single precision exponential function `armpl_svexp_f32_x`. + +{{< tabpane code=true >}} + {{< tab header="C" language="C" output_lines="15">}} +#include +#include +#include + +int main(void) +{ + svbool_t pg = svptrue_b32(); + svfloat32_t x = svdup_f32(0x1.ebe93cp-1f); // 0.960763812065125 + svfloat32_t y = armpl_svexp_f32_x(x, pg); + float result[svcntw()]; + svst1(pg, result, y); + for (int i = 0; i < svcntw(); i++) { + printf("y (lane %d): %.15f [%a]\n", i, result[i], result[i]); + } + return 0; +} + {{< /tab >}} + + {{< tab header="Output (repro enabled)" language="bash">}} +y (lane 0): 2.613692283630371 [0x1.4e8d78p+1] +y (lane 1): 2.613692283630371 [0x1.4e8d78p+1] +y (lane 2): 2.613692283630371 [0x1.4e8d78p+1] +y (lane 3): 2.613692283630371 [0x1.4e8d78p+1] +y (lane 4): 2.613692283630371 [0x1.4e8d78p+1] +y (lane 5): 2.613692283630371 [0x1.4e8d78p+1] +y (lane 6): 2.613692283630371 [0x1.4e8d78p+1] +y (lane 7): 2.613692283630371 [0x1.4e8d78p+1] + {{< /tab >}} + + {{< tab header="Output (repro disabled)" language="bash">}} +y (lane 0): 2.613692045211792 [0x1.4e8d76p+1] +y (lane 1): 2.613692045211792 [0x1.4e8d76p+1] +y (lane 2): 2.613692045211792 [0x1.4e8d76p+1] +y (lane 3): 2.613692045211792 [0x1.4e8d76p+1] +y (lane 4): 2.613692045211792 [0x1.4e8d76p+1] +y (lane 5): 2.613692045211792 [0x1.4e8d76p+1] +y (lane 6): 2.613692045211792 [0x1.4e8d76p+1] +y (lane 7): 2.613692045211792 [0x1.4e8d76p+1] + {{< /tab >}} +{{< /tabpane >}} + +All active lanes of `y` are guaranteed to match the scalar and Neon results exactly. + +#### Scope and Limitations + +In this section we observed that, when reproducibility is enabled (`AMATH_REPRO` enabled), `expf` produces bitwise-identical results whether it is executed as a scalar, Neon or SVE function. 
+ +This behaviour extends to other reproducible math routines in reproducible Libamath: + +* Scalar, Neon, and SVE implementations are numerically aligned +* Only functions listed in `amath_repro.h` are reproducible +* Reproducible symbols are always prefixed by `armpl_`. They are not provided with `ZGV` mangling. +* Reproducibility is provided on Linux platforms +* Results are independent of vector width or instruction selection +* Reproducible routines prioritize determinism over peak performance \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/reproducibility.md b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/reproducibility.md new file mode 100644 index 0000000000..f937e10830 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/reproducibility.md @@ -0,0 +1,39 @@ +--- +title: Reproducibility +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## What is Reproducibility? + +In numerical software, reproducibility (also referred to as determinism) means you get the exact same floating-point bits for the same inputs — even if you run a different implementation (scalar vs Neon vs SVE). + +In pure mathematics, two functions `𝑓(𝑥)` and `𝑔(𝑥)` are equivalent if, for all `𝑥` in their domain `𝑓(𝑥) = 𝑔(𝑥)`. + + +In practice, numerical software replaces continuous mathematical functions over real numbers with discrete approximations using floating-point numbers. Instead of comparing two abstract functions, we compare two implementations. For example, a scalar version and a vectorized version of the same routine. + +We say that two programs are reproducible if, for the same input values, they produce exactly the same floating-point results, down to the last bit. 
+ +{{% notice Accuracy vs Reproducibility %}} + +Note that this requirement is **independent** of the accuracy requirement: two results can both be within an acceptable error bound and still differ in their bit patterns. + +However, correctly rounded routines (maximum error under 0.5ULP) are inherently reproducible, since for a given input, rounding mode and precision, the output is the floating-point number closest to the exact mathematical result. + +{{% /notice %}} + +## Levels of Reproducibility + +Reproducibility can be defined at different levels, depending on how similar or different the execution environments are: + +* **Cross-architecture reproducibility** + + Reproducibility across different processor architectures, such as x86 and AArch64. +* **Cross-vector-extension reproducibility** + + Reproducibility across different vector execution paths on the same architecture, such as scalar, Neon, and SVE on AArch64. + +In this Learning Path, we focus on cross-vector-extension reproducibility (scalar, Neon, SVE on AArch64). \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/reproducibility_libamath.md b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/reproducibility_libamath.md new file mode 100644 index 0000000000..55ee75ca42 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/reproducibility_libamath.md @@ -0,0 +1,54 @@ +--- +title: Reproducibility in Libamath +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Cross-vector-extension reproducibility + +On Linux platforms, Libamath supports bitwise-reproducible results across scalar, Neon (AdvSIMD), and SVE implementations for a subset of math functions. + +When reproducibility is enabled, the same input values produce identical floating-point results, regardless of whether a supported function is executed using the scalar, Neon, or SVE code path. 
This keeps your results deterministic even if your app takes different vector paths. + +Reproducible Libamath routines operate in the default accuracy mode, guaranteeing results within 3.5 ULP of the correctly rounded value. + +Note that reproducible routines prioritize determinism over peak performance. + +## Reproducible symbols + +When reproducibility is enabled: + +* Reproducible functions use the same public function names as their non-reproducible counterparts + +* The linker resolves calls to the reproducible implementations when you build with `-DAMATH_REPRO=1` and link `-lamath_repro` + +* Scalar, Neon, and SVE variants of a function all produce bitwise-identical results + +* Unlike the symbols you find in `amath.h` (which don't guarantee reproducibility), reproducible symbols in `amath_repro` are not provided in `ZGV` mangling (only the `armpl_` notation is used). + +The full list of functions that support reproducible behavior is provided in the header file `amath_repro.h` + +## How to use reproducible Libamath + +To enable reproducibility in a C or C++ application: + +1. Include the Libamath header + +```C +#include +``` + +2. Compile with reproducibility enabled + +```bash +-DAMATH_REPRO=1 +``` + +3. Link against the reproducible Libamath library +```bash +-lamath_repro +``` + +When you follow these steps, calls to supported functions resolve to the reproducible scalar, Neon, or SVE implementations. 
\ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/rtp-llm/_index.md b/content/learning-paths/servers-and-cloud-computing/rtp-llm/_index.md index 10c8d1393d..d81741888e 100644 --- a/content/learning-paths/servers-and-cloud-computing/rtp-llm/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/rtp-llm/_index.md @@ -19,6 +19,11 @@ author: Tianyu Li ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/snappy/_index.md b/content/learning-paths/servers-and-cloud-computing/snappy/_index.md index 6379fd17e7..f1d99399bf 100644 --- a/content/learning-paths/servers-and-cloud-computing/snappy/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/snappy/_index.md @@ -25,6 +25,9 @@ test_maintenance: true ### Tags skilllevels: Introductory subjects: Libraries +cloud_service_providers: + - AWS + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/_index.md b/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/_index.md index ea4e2a1953..fce28125a7 100644 --- a/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/_index.md @@ -20,6 +20,11 @@ author: Preema Merlin Dsouza ### Tags skilllevels: Introductory subjects: Libraries +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/supervisord/_index.md b/content/learning-paths/servers-and-cloud-computing/supervisord/_index.md index 29db78af1b..5fbbca7a44 100644 --- a/content/learning-paths/servers-and-cloud-computing/supervisord/_index.md +++ 
b/content/learning-paths/servers-and-cloud-computing/supervisord/_index.md @@ -19,6 +19,8 @@ author: Jason Andrews ### Tags skilllevels: Introductory subjects: Performance and Architecture +cloud_service_providers: + - AWS armips: - Neoverse - Cortex-A diff --git a/content/learning-paths/servers-and-cloud-computing/sve/_index.md b/content/learning-paths/servers-and-cloud-computing/sve/_index.md index 1774ba2c94..86c07faa09 100644 --- a/content/learning-paths/servers-and-cloud-computing/sve/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/sve/_index.md @@ -19,6 +19,11 @@ author: Florent Lebeau ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse - Cortex-A diff --git a/content/learning-paths/servers-and-cloud-computing/sve2-match/_index.md b/content/learning-paths/servers-and-cloud-computing/sve2-match/_index.md index cfb3cb802e..d8b4e77243 100644 --- a/content/learning-paths/servers-and-cloud-computing/sve2-match/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/sve2-match/_index.md @@ -22,6 +22,10 @@ author: Pareena Verma ### Tags skilllevels: Introductory subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/sysreport/_index.md b/content/learning-paths/servers-and-cloud-computing/sysreport/_index.md index de5480792e..a1fa7ec098 100644 --- a/content/learning-paths/servers-and-cloud-computing/sysreport/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/sysreport/_index.md @@ -18,6 +18,11 @@ author: James Whitaker ### Tags skilllevels: Introductory subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Cortex-A - Neoverse diff --git 
a/content/learning-paths/servers-and-cloud-computing/thirdai-sentiment-analysis/_index.md b/content/learning-paths/servers-and-cloud-computing/thirdai-sentiment-analysis/_index.md index eae80d8e9b..a382ac0ed9 100644 --- a/content/learning-paths/servers-and-cloud-computing/thirdai-sentiment-analysis/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/thirdai-sentiment-analysis/_index.md @@ -17,6 +17,11 @@ author: ThirdAI ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/_index.md b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/_index.md index bc05efc48c..38457558a2 100644 --- a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/_index.md @@ -1,13 +1,9 @@ --- title: Deploy a live sensor dashboard with TimescaleDB and Grafana on Google Cloud C4A - -draft: true -cascade: - draft: true minutes_to_complete: 45 -who_is_this_for: This learning path is for DevOps engineers, database engineers, and software developers who want to deploy and operate TimescaleDB on SUSE Linux Enterprise Server (SLES) Arm64, ingest live time-series sensor data, and visualize it in Grafana. +who_is_this_for: This is an introductory topic for DevOps engineers, database engineers, and software developers who want to deploy and operate TimescaleDB on SUSE Linux Enterprise Server (SLES) Arm64, ingest live time-series sensor data, and visualize it in Grafana. 
learning_objectives: - Install and configure TimescaleDB on Google Cloud C4A Axion processors by building from source for Arm64 @@ -24,7 +20,7 @@ author: Pareena Verma ##### Tags skilllevels: Introductory subjects: Databases -cloud_service_providers: +cloud_service_providers: - Google Cloud armips: @@ -70,5 +66,4 @@ layout: "learningpathall" learning_path_main_page: yes --- -TimescaleDB is a high-performance, open-source time-series database built on PostgreSQL that provides powerful features for storing, querying, and analyzing time-series data efficiently. When you deploy TimescaleDB on Google Cloud C4A Axion Arm-based processors, you can achieve high-throughput time-series ingestion and query processing with optimized performance per watt and lower infrastructure costs. This Learning Path shows you how to build a complete time-series data pipeline with live sensor ingestion and real-time visualization using Grafana. diff --git a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/background.md b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/background.md index a9f20c4245..92c922100d 100644 --- a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/background.md +++ b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/background.md @@ -29,10 +29,10 @@ To learn more, visit the [TimescaleDB website](https://www.timescale.com/) and e ## What you've accomplished and what's next -In this section, you learned about: +In this section, you: -* Google Axion C4A Arm-based VMs and their performance advantages for time-series workloads -* TimescaleDB and its key features, including hypertables, continuous aggregates, and retention policies -* How Arm architecture enables cost-efficient, high-throughput ingestion and query processing for time-series data +- Explored Google Axion C4A Arm-based VMs and their performance advantages for time-series workloads +- Reviewed TimescaleDB key features, 
including hypertables, continuous aggregates, and retention policies +- Understood how Arm architecture enables cost-efficient, high-throughput ingestion and query processing -Next, you'll create firewall rules to enable remote access to the Grafana dashboard that you'll build later in this Learning Path. +Next, you'll create a firewall rule to enable remote access to the Grafana dashboard you'll build later in this Learning Path. diff --git a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/firewall-setup.md b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/firewall-setup.md index ab9f33bd40..7cf02546c0 100644 --- a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/firewall-setup.md +++ b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/firewall-setup.md @@ -9,31 +9,31 @@ layout: learningpathall Create a firewall rule in Google Cloud Console to expose TCP port 3000 for the TimescaleDB (Grafana) management interface. {{% notice Note %}} -For support on GCP setup, see the Learning Path [Getting started with Google Cloud Platform](/learning-paths/servers-and-cloud-computing/csp/google/). +For help with GCP setup, see the Learning Path [Getting started with Google Cloud Platform](/learning-paths/servers-and-cloud-computing/csp/google/). {{% /notice %}} ## Configure the firewall rule Navigate to the [Google Cloud Console](https://console.cloud.google.com/), go to **VPC Network > Firewall**, and select **Create firewall rule**. -![Create a firewall rule](images/firewall-rule.png "Create a firewall rule") +![Google Cloud Console VPC Network Firewall page showing the Create firewall rule button in the top menu bar alt-txt#center](images/firewall-rule.png "Create a firewall rule in Google Cloud Console") Next, create the firewall rule that exposes TCP port 3000. Set the **Name** of the new rule to "allow-tcp-3000". 
Select your network that you intend to bind to your VM (default is "autoscaling-net" but your organization might have others). Set **Direction of traffic** to "Ingress". Set **Allow on match** to "Allow" and **Targets** to "Specified target tags". Enter "allow-tcp-3000" in the **Target tags** text field. Set **Source IPv4 ranges** to "0.0.0.0/0". -![Create a firewall rule](images/network-rule.png "Creating the TCP/3000 firewall rule") +![Google Cloud Console Create firewall rule form with Name set to allow-tcp-3000, Direction of traffic set to Ingress, and Target tags field showing allow-tcp-3000 alt-txt#center](images/network-rule.png "Configuring the allow-tcp-3000 firewall rule") Finally, select **Specified protocols and ports** under the **Protocols and ports** section. Select the **TCP** checkbox, enter "3000" in the **Ports** text field, and select **Create**. -![Specifying the TCP port to expose](images/network-port.png "Specifying the TCP port to expose") +![Google Cloud Console Protocols and ports section with the TCP checkbox selected and port 3000 entered in the Ports text field alt-txt#center](images/network-port.png "Setting TCP port 3000 in the firewall rule") ## What you've accomplished and what's next -You've successfully: +In this section, you: -* Created a firewall rule in Google Cloud to expose port 3000 for Grafana web interface access -* Configured network ingress rules to allow remote connections to your dashboard +- Created a firewall rule to expose TCP port 3000 for Grafana web interface access +- Configured network ingress rules to allow remote connections from any source IP Next, you'll provision a Google Axion C4A Arm virtual machine and apply this firewall rule to enable external access to Grafana. 
diff --git a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/grafana-timescaledb-setup.md b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/grafana-timescaledb-setup.md index 7e619647d7..f3ce1a51bf 100644 --- a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/grafana-timescaledb-setup.md +++ b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/grafana-timescaledb-setup.md @@ -1,18 +1,16 @@ --- -title: Grafana Setup & TimescaleDB Data Source Configuration +title: Install Grafana and configure the TimescaleDB data source weight: 7 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Grafana Setup & TimescaleDB Data Source +## Install and configure Grafana In this section, you install Grafana on a SUSE Arm64 virtual machine, access its web interface, and connect it to TimescaleDB. Grafana acts as the visualization layer that queries TimescaleDB and displays time-series data in dashboards. -This setup enables real-time monitoring and analytics of sensor or application data stored in TimescaleDB. - -## Architecture (At This Stage) +This setup enables real-time monitoring and analytics of sensor data stored in TimescaleDB. ```text Python Sensor Ingest Script @@ -24,7 +22,7 @@ TimescaleDB (PostgreSQL) Grafana Dashboard ``` -## Install Grafana on SUSE (Arm64) +## Install Grafana on SUSE Grafana is available via RPM packages and works natively on Arm64. 
@@ -42,7 +40,7 @@ sudo systemctl start grafana-server sudo systemctl enable grafana-server ``` -**Verify status:** +Verify the service is running: ```bash sudo systemctl status grafana-server @@ -56,7 +54,7 @@ The output is similar to: Active: active (running) since Tue 2026-02-17 08:57:45 UTC; 1h 31min ago ```` -## Access Grafana Web UI +## Access the Grafana web UI Open your browser and navigate to: @@ -64,7 +62,7 @@ Open your browser and navigate to: http://:3000 ``` -### Default Login Credentials +### Default login credentials | Field | Value | | -------- | ----- | @@ -73,21 +71,17 @@ http://:3000 You will be prompted to change the password on first login. Provide and save off a new password. Re-login if needed using the new password: -![Grafana login page#center](images/grafana-login-page.png "Grafana login page") +![Grafana login page showing username and password fields with a Sign In button alt-txt#center](images/grafana-login-page.webp "Grafana login page") You will be presented with the main dashboard for Grafana: -![Grafana dashboard#center](images/grafana-dashboard.png "Grafana dashboard") - -## Add TimescaleDB as a Data Source +![Grafana main dashboard showing the Home screen with sidebar navigation including Dashboards, Explore, and Connections menu items alt-txt#center](images/grafana-dashboard.webp "Grafana main dashboard") -### Step 1: Open Data Sources +## Add TimescaleDB as a data source -From the Grafana sidebar: +### Step 1: Open data sources -```bash -Connections → Data sources → Add data source -``` +From the Grafana sidebar, navigate to **Connections** → **Data sources** → **Add data source**. ### Step 2: Choose PostgreSQL @@ -96,7 +90,7 @@ Select PostgreSQL (TimescaleDB is PostgreSQL-compatible). 
![Add PostgreSQL data source in Grafana#center](images/psql-data-source.png "Add PostgreSQL data source") -### Step 3: Configure Connection Settings +### Step 3: Configure connection settings Fill the form exactly as below: @@ -108,24 +102,20 @@ Fill the form exactly as below: | Password | `` | | TLS/SSL Mode | `disable` | -![PostgreSQL data source connection settings#center](images/data-source-details.png "PostgreSQL data source settings") +![PostgreSQL data source connection settings alt-txt#center](images/data-source-details.png "PostgreSQL data source settings") -**Scroll down and click:** - -```text -Save & Test -``` +Scroll down and select **Save & Test**. You should see "Database connection OK." -![Grafana PostgreSQL data source save and test success#center](images/data-source-save-test.png "Grafana PostgreSQL data source save and test") +![Grafana PostgreSQL data source save and test success alt-txt#center](images/data-source-save-test.png "Grafana PostgreSQL data source save and test") -## What You Have Accomplished +## What you've accomplished and what's next -- Installed Grafana on SUSE Arm64 -- Logged into Grafana UI -- Connected Grafana to TimescaleDB successfully +In this section, you: -## What’s Next +- Installed Grafana on SUSE Arm64 and started the service +- Accessed the Grafana web UI and updated the default password +- Connected Grafana to TimescaleDB as a PostgreSQL data source -In the next section, you will create a live dashboard to visualize real-time sensor temperature data. +In the next section, you'll create a live dashboard to visualize real-time sensor temperature data. 
diff --git a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/grafana-dashboard.png b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/grafana-dashboard.png deleted file mode 100644 index d7e5099d01..0000000000 Binary files a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/grafana-dashboard.png and /dev/null differ diff --git a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/grafana-dashboard.webp b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/grafana-dashboard.webp new file mode 100644 index 0000000000..4606e853d9 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/grafana-dashboard.webp differ diff --git a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/grafana-login-page.png b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/grafana-login-page.png deleted file mode 100644 index 7539219a81..0000000000 Binary files a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/grafana-login-page.png and /dev/null differ diff --git a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/grafana-login-page.webp b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/grafana-login-page.webp new file mode 100644 index 0000000000..822fdb0af2 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/grafana-login-page.webp differ diff --git a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/live-sensor-temperature.png b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/live-sensor-temperature.png deleted file mode 100644 index 112b8bee4e..0000000000 Binary files 
a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/live-sensor-temperature.png and /dev/null differ diff --git a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/live-sensor-temperature.webp b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/live-sensor-temperature.webp new file mode 100644 index 0000000000..92382b4340 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/images/live-sensor-temperature.webp differ diff --git a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/instance.md b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/instance.md index a28880fb76..798c6fc36f 100644 --- a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/instance.md +++ b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/instance.md @@ -18,26 +18,26 @@ For help with GCP setup, see the Learning Path [Getting started with Google Clou To create a virtual machine based on the C4A instance type: -* Navigate to the [Google Cloud Console](https://console.cloud.google.com/). -* Go to **Compute Engine > VM Instances** and select **Create Instance**. -* Under **Machine configuration**: - * Populate fields such as **Instance name**, **Region**, and **Zone**. - * Set **Series** to `C4A`. - * Select `c4a-standard-4` for machine type. +- Navigate to the [Google Cloud Console](https://console.cloud.google.com/). +- Go to **Compute Engine** > **VM Instances** and select **Create Instance**. +- Under **Machine configuration**: + - Populate fields such as **Instance name**, **Region**, and **Zone**. + - Set **Series** to `C4A`. + - Select `c4a-standard-4` for machine type. ![Screenshot of the Google Cloud Console showing the Machine configuration section. 
The Series dropdown is set to C4A and the machine type c4a-standard-4 is selected#center](images/gcp-vm.png "Configuring machine type to C4A in Google Cloud Console") -* Under **OS and storage**, select **Change**, and then choose an Arm64-based operating system image. - * For this Learning Path, select **SUSE Linux Enterprise Server**. - * For the license type, choose **Pay as you go**. - * Increase **Size (GB)** from **10** to **100** to allocate sufficient disk space. - * Select **Choose** to apply the changes. -* Under **Networking**, enable **Allow HTTP traffic** and **Allow HTTPS traffic**. Also, add the following tag "allow-tcp-3000" to enable remote grafana access. -* Select **Create** to launch the virtual machine. +- Under **OS and storage**, select **Change**, and then choose an Arm64-based operating system image. + - For this Learning Path, select **SUSE Linux Enterprise Server**. + - For the license type, choose **Pay as you go**. + - Increase **Size (GB)** from **10** to **100** to allocate sufficient disk space. + - Select **Choose** to apply the changes. +- Under **Networking**, enable **Allow HTTP traffic** and **Allow HTTPS traffic**. Also add the tag `allow-tcp-3000` to enable remote Grafana access. +- Select **Create** to launch the virtual machine. -After the instance starts, click **SSH** next to the VM in the instance list to open a browser-based terminal session. +After the instance starts, select **SSH** next to the VM in the instance list to open a browser-based terminal session. 
-![Google Cloud Console VM instances page displaying running instance with green checkmark and SSH button in the Connect column#center](images/gcp-pubip-ssh.png "Connecting to a running C4A VM using SSH") +![Google Cloud Console VM instances page displaying running instance with green checkmark and SSH button in the Connect column alt-txt#center](images/gcp-pubip-ssh.png "Connecting to a running C4A VM using SSH") A new browser window opens with a terminal connected to your VM. @@ -47,4 +47,4 @@ A new browser window opens with a terminal connected to your VM. In this section, you provisioned a Google Axion C4A Arm VM and connected to it using SSH. -Next, you'll need to install TimescaleDB and the required dependencies on your VM. +Next, you'll install TimescaleDB and the required dependencies on your VM. diff --git a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/live-sensor-dashboard.md b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/live-sensor-dashboard.md index 71b863de70..99068289b1 100644 --- a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/live-sensor-dashboard.md +++ b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/live-sensor-dashboard.md @@ -1,16 +1,14 @@ --- -title: Live Sensor Temperature Dashboard +title: Build a live sensor temperature dashboard weight: 8 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Create a Live Sensor Temperature Dashboard +## Create a live sensor temperature dashboard -In this section, you will create a **real-time Grafana dashboard** that visualizes **live temperature data** stored in **TimescaleDB**. - -This dashboard continuously updates and helps you monitor sensor temperature changes in near real time. +In this section, you'll create a Grafana dashboard that visualizes live temperature data stored in TimescaleDB. The dashboard continuously updates to display sensor temperature changes in near real time. 
## Prerequisites @@ -42,8 +40,8 @@ Log in using your Grafana credentials. ## Create a New Dashboard - From the left sidebar, select **Dashboards** -- Click **New dashboard** -- Select Add a **new visualization** +- Select **New dashboard** +- Select **New visualization** You will be redirected to the Edit panel screen. @@ -51,13 +49,13 @@ You will be redirected to the Edit panel screen. In the Query section: -- Data source: PostgreSQL / TimescaleDB -- Query type: SQL -- Format: Time series +- **Data source:** PostgreSQL / TimescaleDB +- **Query type:** SQL +- **Format:** Time series -![Grafana visualization configuration showing time series panel settings#center](images/data-source-visualization.png "Grafana visualization configuration") +![Grafana time series panel editor showing the visualization configuration screen with the time series panel type selected and panel title field alt-txt#center](images/data-source-visualization.png "Grafana visualization configuration") -Paste the following query after pressing the "Code" button on the right of the query editor: +Paste the following query after selecting **Code** on the right of the query editor: ```sql SELECT @@ -76,14 +74,14 @@ Apply the following settings in the right-hand panel: - Visualization Settings - - Visualization: Time series - - Panel title: Live Sensor Temperature - - Table view: Disabled + - **Visualization:** Time series + - **Panel title:** Live Sensor Temperature + - **Table view:** Disabled - Time & Refresh Settings - - Time range: Last 5 minutes - - Refresh interval: 5s + - **Time range:** Last 5 minutes + - **Refresh interval:** 5s These settings ensure the panel refreshes automatically with new data. @@ -91,26 +89,16 @@ These settings ensure the panel refreshes automatically with new data. Once configured, the panel should display a continuously updating temperature graph. 
-![Grafana dashboard showing Live Sensor Temperature time series panel alt-txt#center](images/live-sensor-temperature.png "Live Sensor Temperature Panel") +![Grafana live sensor temperature time series panel showing a continuous line graph of temperature readings from multiple sensors over the last 5 minutes alt-txt#center](images/live-sensor-temperature.webp "Live sensor temperature panel") -## Save the Dashboard +## Save the dashboard -- Click Save dashboard (top-right corner) +- Select **Save dashboard** (top-right corner) - Enter a name, for example: Live Sensor Monitoring Dashboard -- Click Save +- Select **Save** The dashboard is now active. -## What you've accomplished and what's next - -You've successfully: - -- Built a complete time-series data pipeline on Google Cloud C4A Axion Arm-based processors -- Installed and configured TimescaleDB by building from source for optimal Arm64 performance -- Created a real-time sensor data ingestion pipeline using Python with hypertables and continuous aggregates -- Configured retention policies and indexes for production-ready time-series storage -- Installed Grafana and connected it to TimescaleDB as a data source -- Built a live sensor dashboard that automatically refreshes to display real-time temperature data -- Validated end-to-end data flow from ingestion through TimescaleDB to Grafana visualization +## What you've accomplished -You now have a production-ready time-series monitoring solution running natively on Arm infrastructure. This setup demonstrates how TimescaleDB and Grafana work together to provide comprehensive real-time monitoring for IoT and sensor workloads. You can extend this foundation by adding more sensors, creating additional dashboards, implementing alerting rules in Grafana, or optimizing TimescaleDB for your specific workload patterns. +You've built a complete time-series monitoring pipeline on Google Cloud C4A Axion Arm-based processors. 
TimescaleDB is running natively on Arm64, ingesting live sensor data through Python, and serving queries to Grafana for real-time visualization. From here you can add more sensors, create additional dashboards, set up alerting rules in Grafana, or tune TimescaleDB for your specific workload. diff --git a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/sensor-data-ingestion.md b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/sensor-data-ingestion.md index cf41a8f02a..3611ea9858 100644 --- a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/sensor-data-ingestion.md +++ b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/sensor-data-ingestion.md @@ -1,16 +1,16 @@ --- -title: Real-Time Sensor Data Ingestion on Arm64 +title: Ingest real-time sensor data on Arm64 weight: 6 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## Real-Time Sensor Data Ingestion +## Ingest real-time sensor data In this section, you simulate real-time sensor data using Python and continuously ingest it into TimescaleDB running on an Arm64 VM. This creates a live time-series data stream that can later be visualized using Grafana. -## Architecture Overview +## Architecture overview ```text Python Sensor Generator @@ -25,7 +25,7 @@ TimescaleDB Hypertable This architecture mirrors real-world IoT and telemetry pipelines. -## Install Python Dependencies (SUSE) +## Install Python dependencies ```bash cd $HOME @@ -35,7 +35,7 @@ sudo zypper install -y \ python3-psycopg2 ``` -**Verify psycopg2:** +Verify psycopg2 is installed correctly: ```bash python3 - < ingest.log 2>&1 & @@ -118,7 +118,7 @@ nohup python3 sensor_ingest.py > ingest.log 2>&1 & This ensures the sensor generator continues running even after you close the terminal. 
-### Verify Data Ingestion +### Verify data ingestion ```bash ps -ef | grep sensor_ingest.py @@ -160,11 +160,13 @@ gcpuser@tsdb-suse-arm64:~> sudo -u postgres psql sensors -c "SELECT COUNT(*) FRO (1 row) ``` -## Time-Series Optimization +## Optimize time-series performance These steps make TimescaleDB production-ready. -### Create Index for Faster Queries +### Create an index for faster queries + +Connect to the sensors database and create an index optimized for time-range scans by sensor: ```bash sudo -u postgres psql sensors @@ -176,14 +178,11 @@ Issue the following SQL command: CREATE INDEX ON sensor_data (sensor_id, time DESC); ``` -This index: +This index improves Grafana query performance for time-range scans. -- Improves Grafana query performance -- Optimized for time-range scans +### Enable a data retention policy -### Enable Data Retention Policy - -Automatically remove old data after 7 days: +Automatically remove data older than seven days: ```sql SELECT add_retention_policy( @@ -192,10 +191,11 @@ SELECT add_retention_policy( ); ``` -- Prevents disk exhaustion -- Runs automatically in the background +This prevents disk exhaustion and runs automatically in the background. + +### Create a continuous aggregate -### Create Continuous Aggregate (Hourly Averages) +Precompute hourly averages per sensor for faster reporting: ```sql CREATE MATERIALIZED VIEW sensor_hourly_avg @@ -208,9 +208,9 @@ FROM sensor_data GROUP BY bucket, sensor_id; ``` -Precomputes hourly averages per sensor for faster reporting. +### Add an aggregate refresh policy -### Add Aggregate Refresh Policy +Automate hourly aggregate refresh every five minutes for near real-time analytics: ```sql SELECT add_continuous_aggregate_policy( @@ -221,9 +221,7 @@ SELECT add_continuous_aggregate_policy( ); ``` -Automates hourly aggregate refresh every 5 minutes for near real-time analytics. 
- -### What this means +The table below explains the three interval parameters: | Setting | Meaning | | ------- | ------------------ | @@ -231,7 +229,7 @@ Automates hourly aggregate refresh every 5 minutes for near real-time analytics. | 1 hour | Skip newest data | | 5 min | Refresh interval | -### Validate Optimization +### Validate the optimization ```sql SELECT * FROM sensor_hourly_avg LIMIT 5; @@ -260,11 +258,11 @@ postgres=# SELECT COUNT(*) FROM sensor_data; ``` -Please press "CTRL-D" to exit. +Press Ctrl+D to exit. ### Set the postgres password -Lets set a password for postgres: +Let's set a password for postgres so Grafana can connect in the next section: ```bash sudo -u postgres psql @@ -276,7 +274,7 @@ Then enter the new password: \password postgres ``` -Save the password as it will be used in the next section. Please press "CTRL-D" to exit. +Save the password — you'll need it when configuring the Grafana data source. Press Ctrl+D to exit. ## What you've accomplished and what's next diff --git a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/setup-timescaledb.md b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/setup-timescaledb.md index 003a5744ba..8cfce2a029 100644 --- a/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/setup-timescaledb.md +++ b/content/learning-paths/servers-and-cloud-computing/timescaledb-on-gcp/setup-timescaledb.md @@ -1,16 +1,16 @@ --- -title: TimescaleDB Environment Setup on Arm64 +title: Set up TimescaleDB on Arm64 weight: 5 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## TimescaleDB Environment Setup +## Set up the TimescaleDB environment -In this section, you prepare an Arm64-based SUSE Linux Enterprise Server (SLES) virtual machine and install TimescaleDB by building it from source. Building from source ensures the database extension is fully optimized for the Arm64 architecture, which is especially important for high-ingest and time-series workloads. 
+In this section, you prepare an Arm64-based SUSE Linux Enterprise Server (SLES) virtual machine and install TimescaleDB by building it from source. Building from source ensures the database extension is fully optimized for Arm64, which is especially important for high-ingest and time-series workloads. -## Architecture Overview +## Architecture overview ```text Linux Arm64 VM (SUSE) @@ -24,7 +24,7 @@ TimescaleDB 2.25.0 Extension TimescaleDB provides time-series optimizations on top of PostgreSQL, making it ideal for high-ingest sensor workloads. -## Install Build Dependencies (SUSE) +## Install build dependencies TimescaleDB must be compiled against PostgreSQL, so development headers and build tools are required. @@ -44,30 +44,17 @@ sudo zypper install \ postgresql15-devel ``` -### Important (SUSE note) - -If you are prompted about `readline-devel`, choose **Solution 1 (vendor change/downgrade)**. - -**Why this matters:** - -- This avoids dependency conflicts on SUSE. -- It ensures compatibility with PostgreSQL development libraries. +{{% notice Note %}}If you are prompted about `readline-devel`, choose **Solution 1 (vendor change/downgrade)**. This avoids dependency conflicts on SUSE and ensures compatibility with PostgreSQL development libraries.{{% /notice %}} ## Initialize PostgreSQL -Before using PostgreSQL, its data directory must be initialized. +Before using PostgreSQL, its data directory must be initialized. The following command runs as the `postgres` system user to create the data directory, initialize system tables, and set default configurations: ```bash sudo -u postgres initdb -D /var/lib/pgsql/data ``` -**What this does:** - -- Creates the PostgreSQL data directory. -- Initializes system tables and default configurations. -- Runs as the postgres system user for security. 
- -**Enable and start PostgreSQL:** +Enable and start PostgreSQL: ```bash sudo systemctl enable postgresql @@ -75,7 +62,7 @@ sudo systemctl start postgresql ``` -**Verify PostgreSQL:** +Verify PostgreSQL is running: ```bash psql --version @@ -87,21 +74,20 @@ The output is similar to: psql (PostgreSQL) 15.10 ``` -## Build TimescaleDB from Source (Arm64) +## Build TimescaleDB from source Building TimescaleDB from source ensures native Arm64 compilation and optimal performance. ### Clone the repository +Download the official TimescaleDB source code and check out version 2.25.0 to ensure version consistency throughout this Learning Path: + ```bash git clone https://github.com/timescale/timescaledb.git cd timescaledb git checkout 2.25.0 ``` -- Download the official TimescaleDB source code. -- Check out version 2.25.0 to ensure version consistency throughout the learning path - {{% notice Note %}} According to the [release notes](https://github.com/timescale/timescaledb/releases/tag/2.16.0), TimescaleDB 2.16.0 introduces performance optimizations for DML on compressed chunks, improving upsert operations by **100×** and update/delete operations by **1000×** in some cases. @@ -130,7 +116,7 @@ TimescaleDB must be preloaded when PostgreSQL starts. ### Edit PostgreSQL configuration and add the timescaledb library -Using a suitable editor and "sudo", edit **/var/lib/pgsql/data/postgresql.conf** and add the following line: +Using a suitable editor and `sudo`, edit `/var/lib/pgsql/data/postgresql.conf` and add the following line: ```text shared_preload_libraries = 'timescaledb' @@ -147,9 +133,9 @@ This update: sudo systemctl restart postgresql ``` -## Create Database and Enable Extension +## Create a database and enable the extension -Now you enable TimescaleDB at the database level. 
+Enable TimescaleDB at the database level by creating the `sensors` database and loading the extension: ```bash sudo -u postgres psql @@ -161,13 +147,7 @@ CREATE DATABASE sensors; CREATE EXTENSION IF NOT EXISTS timescaledb; ``` -What this does: - -- Creates a database named sensors. -- Switches to the sensors database. -- Enables TimescaleDB features within that database. - -**Verify version:** +Verify the installed version: ```psql SELECT extversion FROM pg_extension WHERE extname='timescaledb'; @@ -183,12 +163,9 @@ sensors=# SELECT extversion FROM pg_extension WHERE extname='timescaledb'; (1 row) ``` -Press "CTRL-D" to exit. - -**What this confirms:** +Press Ctrl+D to exit. -- TimescaleDB is installed correctly. -- The expected version is active in the database. +This confirms that TimescaleDB is installed correctly and the expected version is active. ## What you've accomplished and what's next diff --git a/content/learning-paths/servers-and-cloud-computing/torchbench/_index.md b/content/learning-paths/servers-and-cloud-computing/torchbench/_index.md index 2095e32d13..532d80c30a 100644 --- a/content/learning-paths/servers-and-cloud-computing/torchbench/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/torchbench/_index.md @@ -18,6 +18,11 @@ author: Pareena Verma ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/_index.md b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/_index.md index 661384b9af..92be71c700 100644 --- a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/_index.md @@ -18,6 +18,11 @@ author: Kieran Hejmadi ### Tags skilllevels: Introductory 
subjects: Performance and Architecture +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/vLLM-quant/_index.md b/content/learning-paths/servers-and-cloud-computing/vLLM-quant/_index.md index 4078ce4a98..3800c186d8 100644 --- a/content/learning-paths/servers-and-cloud-computing/vLLM-quant/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/vLLM-quant/_index.md @@ -29,6 +29,11 @@ author: ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/vectorscan/_index.md b/content/learning-paths/servers-and-cloud-computing/vectorscan/_index.md index 78f981fdb2..722a058137 100644 --- a/content/learning-paths/servers-and-cloud-computing/vectorscan/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/vectorscan/_index.md @@ -19,6 +19,11 @@ author: Pareena Verma ### Tags skilllevels: Introductory subjects: Libraries +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse tools_software_languages: diff --git a/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/_index.md b/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/_index.md index be2944f312..591fbc0b15 100644 --- a/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/vllm-acceleration/_index.md @@ -25,6 +25,9 @@ skilllevels: Introductory subjects: ML cloud_service_providers: - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/vllm/_index.md b/content/learning-paths/servers-and-cloud-computing/vllm/_index.md index 
622d248dcf..4231effc49 100644 --- a/content/learning-paths/servers-and-cloud-computing/vllm/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/vllm/_index.md @@ -19,6 +19,11 @@ author: Jason Andrews ### Tags skilllevels: Introductory subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle armips: - Neoverse operatingsystems: diff --git a/content/learning-paths/servers-and-cloud-computing/vvenc/_index.md b/content/learning-paths/servers-and-cloud-computing/vvenc/_index.md index 978ea58333..e93891e74a 100644 --- a/content/learning-paths/servers-and-cloud-computing/vvenc/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/vvenc/_index.md @@ -22,6 +22,11 @@ operatingsystems: skilllevels: Introductory subjects: Libraries +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle tools_software_languages: - vvenc diff --git a/content/learning-paths/servers-and-cloud-computing/whisper/_index.md b/content/learning-paths/servers-and-cloud-computing/whisper/_index.md index 26bc93c01e..2842c87f36 100644 --- a/content/learning-paths/servers-and-cloud-computing/whisper/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/whisper/_index.md @@ -25,6 +25,11 @@ skilllevels: Introductory armips: - Neoverse subjects: ML +cloud_service_providers: + - AWS + - Microsoft Azure + - Google Cloud + - Oracle operatingsystems: - Linux tools_software_languages: @@ -33,8 +38,6 @@ tools_software_languages: - Demo - Hugging Face -cloud_service_providers: - - AWS further_reading: diff --git a/hugo-server.sh b/hugo-server.sh index 7c3198adef..0c120cd5de 100755 --- a/hugo-server.sh +++ b/hugo-server.sh @@ -5,41 +5,6 @@ # ----------------------------------------------------------------------------- hugo --buildDrafts -# ============================================================================= -# Enable the home page search box. 
-# ----------------------------------------------------------------------------- -# Attempt to use the system's pagefind if it is available on PATH or as an alias, -# otherwise default to our own local version of it in order to generate the -# search index data. - -# Get ourselves a useable pagefind. -PAGEFIND=pagefind -if ! command $PAGEFIND --version &> /dev/null; then - case "$(uname -s)" in - Darwin*) - PAGEFIND=bin/pagefind.arm64 - ;; - Linux*) - if [ "$(uname -m)" == "aarch64" ]; then - PAGEFIND=bin/pagefind.aarch64 - else - PAGEFIND=bin/pagefind - fi - ;; - MINGW*|CYGWIN*|MSYS_NT*) - PAGEFIND=bin/pagefind.exe - ;; - *) - echo "No pagefind executable found or known for this platform" - PAGEFIND="" - esac -fi - -# If we have a pagefind executable, generate the search index. -if [[ -n "$PAGEFIND" ]]; then - $PAGEFIND --site "public" --output-subdir ../static/pagefind -fi - # ============================================================================= # Serve our local tree for interactive development. # ----------------------------------------------------------------------------- diff --git a/pagefind.yml b/pagefind.yml deleted file mode 100644 index b5511dd9e0..0000000000 --- a/pagefind.yml +++ /dev/null @@ -1,2 +0,0 @@ -glob: "**/[!_]*/*.html" -