From 4e3ffb217c7273c44829b24ac2fafbe680447050 Mon Sep 17 00:00:00 2001 From: SergeyMenshykh Date: Thu, 14 May 2026 20:15:06 +0100 Subject: [PATCH] Python: Parse YAML block scalars in SKILL.md frontmatter The frontmatter parser previously matched only single-line `key: value` pairs, so block scalar indicators (`|` literal, `>` folded, with chomping `-`/`+`) were silently truncated to the indicator character. Multi-line descriptions like `description: >\n ...` lost their content. Add `_parse_yaml_scalar_value()` which detects block scalar indicators, collects indented continuation lines, strips the common leading indentation, joins per scalar style (newlines for `|`, spaces for `>`), and applies chomping per the YAML 1.2 spec. Update `_extract_frontmatter()` to use the helper for unquoted values. Adds 15 unit tests covering literal/folded styles, all chomping variants, indentation handling, content containing colons, non-description fields, tab indentation, blank-line preservation, and a regression test for plain values. Fixes #5713. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../packages/core/agent_framework/_skills.py | 95 ++++++++++- .../packages/core/tests/core/test_skills.py | 151 ++++++++++++++++-- 2 files changed, 236 insertions(+), 10 deletions(-) diff --git a/python/packages/core/agent_framework/_skills.py b/python/packages/core/agent_framework/_skills.py index c1d0c77e45..ba550e7095 100644 --- a/python/packages/core/agent_framework/_skills.py +++ b/python/packages/core/agent_framework/_skills.py @@ -1513,6 +1513,97 @@ def __call__( # must not start or end with a hyphen, and must not contain consecutive hyphens. VALID_NAME_RE = re.compile(r"^[a-z0-9]([a-z0-9]*-[a-z0-9])*[a-z0-9]*$") +# Block scalar indicator characters recognised by the lightweight YAML parser. 
_BLOCK_SCALAR_INDICATORS = ("|", ">")

# Everything that may legally follow the style indicator on a block scalar
# header line: an optional chomping indicator (``+``/``-``) and an optional
# indentation indicator (``1``-``9``) in either order, then an optional
# ``# comment``. Anything else means the value is ordinary text that merely
# happens to start with ``|`` or ``>``.
_BLOCK_SCALAR_HEADER_TAIL_RE = re.compile(
    r"(?:(?P<chomp_first>[+-])[1-9]?|[1-9](?P<chomp_last>[+-])?)?"
    r"(?:[ \t]+#.*)?[ \t]*"
)


def _fold_block_lines(lines: list[str]) -> str:
    """Join dedented lines the way a folded (``>``) block scalar does.

    A single line break between two text lines becomes one space. Each blank
    line between two text lines becomes one ``"\\n"`` so paragraphs stay
    separated. Blank lines before the first text line are dropped.
    """
    pieces: list[str] = []
    blank_run = 0  # blank lines seen since the previous text line
    for line in lines:
        if not line:
            blank_run += 1
            continue
        if pieces:
            pieces.append("\n" * blank_run if blank_run else " ")
        blank_run = 0
        pieces.append(line)
    return "".join(pieces)


def _parse_yaml_scalar_value(yaml_content: str, kv_match: re.Match[str]) -> str:
    """Resolve the scalar value for an unquoted YAML key-value match.

    Args:
        yaml_content: The full text that ``kv_match`` was found in.
        kv_match: A match whose group 3 holds the unquoted value text.

    Returns:
        The captured text unchanged for plain values (including values that
        start with ``|`` or ``>`` but are not a valid block scalar header),
        otherwise the resolved block scalar content. A block scalar with no
        content lines resolves to ``""``.

    A value is treated as a block scalar only when it is a complete header:
    ``|`` (literal) or ``>`` (folded), optionally followed by a chomping
    indicator and/or an indentation indicator, optionally followed by a
    ``# comment``. The lines after the header are collected while they are
    blank or start with a space/tab, the common leading indentation is
    removed, and the lines are joined:

    * literal (``|``) keeps every line break;
    * folded (``>``) turns single line breaks into spaces and blank lines
      into line breaks.

    Chomping:

    * ``-`` (strip): no trailing line break.
    * ``+`` (keep): the final line break plus one line break per trailing
      blank line.
    * default (clip): literal scalars end with exactly one line break.
      Folded scalars end with none; this is a deliberate deviation from
      YAML's clip rule that existing callers rely on.

    Limitations: the indentation indicator digit (e.g. ``|2``) is accepted
    but ignored, indentation is always auto-detected, and "more-indented"
    lines inside a folded scalar are folded like any other line.
    """
    value: str = kv_match.group(3)

    if not value or value[0] not in _BLOCK_SCALAR_INDICATORS:
        return value

    header_tail = _BLOCK_SCALAR_HEADER_TAIL_RE.fullmatch(value[1:])
    if header_tail is None:
        # e.g. ">10x faster builds": plain text, not a block scalar header.
        return value

    scalar_style = value[0]
    chomping = header_tail.group("chomp_first") or header_tail.group("chomp_last") or ""

    # The block content starts on the line after the header.
    header_newline = yaml_content.find("\n", kv_match.end())
    if header_newline < 0:
        # Header is the last line of the input: the block is empty.
        return ""

    candidate_lines = yaml_content[header_newline + 1 :].split("\n")
    if candidate_lines[-1] == "":
        # Artefact of a trailing newline (or empty remainder), not a blank line.
        candidate_lines.pop()

    # Collect indented continuation lines (or blank lines within the block).
    block_lines: list[str] = []
    for line in candidate_lines:
        if not line or line.isspace():
            block_lines.append("")
        elif line[0] in (" ", "\t"):
            block_lines.append(line)
        else:
            # Non-indented, non-blank line: the next key, end of the block.
            break

    # Trailing blank lines are not content, but keep chomping needs the count.
    trailing_blank_lines = 0
    while block_lines and not block_lines[-1]:
        block_lines.pop()
        trailing_blank_lines += 1

    if not block_lines:
        return ""

    # Only space/tab characters count as indentation.
    common_indent = min(len(line) - len(line.lstrip(" \t")) for line in block_lines if line)
    normalized = [line[common_indent:] for line in block_lines]

    parsed = "\n".join(normalized) if scalar_style == "|" else _fold_block_lines(normalized)

    if chomping == "-":
        return parsed
    if chomping == "+":
        return parsed + "\n" * (1 + trailing_blank_lines)
    # Clip (default): literal gets a trailing newline, folded does not.
    return parsed + "\n" if scalar_style == "|" else parsed


# Default system prompt template for advertising available skills to the model.
# Use {skills} as the placeholder for the generated skills XML list.
@@ -2879,7 +2970,9 @@ def _extract_frontmatter( for kv_match in YAML_KV_RE.finditer(yaml_content): key = kv_match.group(1) - value = kv_match.group(2) if kv_match.group(2) is not None else kv_match.group(3) + value = ( + kv_match.group(2) if kv_match.group(2) is not None else _parse_yaml_scalar_value(yaml_content, kv_match) + ) key_lower = key.lower() if key_lower == "name": diff --git a/python/packages/core/tests/core/test_skills.py b/python/packages/core/tests/core/test_skills.py index c386da2ff3..415d6ea857 100644 --- a/python/packages/core/tests/core/test_skills.py +++ b/python/packages/core/tests/core/test_skills.py @@ -319,9 +319,7 @@ def test_duplicate_directories_deduplicated(self, tmp_path: Path) -> None: refs = skill_dir / "references" refs.mkdir(parents=True) (refs / "doc.md").write_text("content", encoding="utf-8") - resources = FileSkillsSource._discover_resource_files( - str(skill_dir), directories=("references", "references") - ) + resources = FileSkillsSource._discover_resource_files(str(skill_dir), directories=("references", "references")) assert resources == ["references/doc.md"] def test_results_are_sorted(self, tmp_path: Path) -> None: @@ -1675,9 +1673,7 @@ def test_whitespace_only_raises(self) -> None: FileSkillsSource._validate_and_normalize_directory_names([" "]) def test_multiple_directories(self) -> None: - result = FileSkillsSource._validate_and_normalize_directory_names( - [".", "references", "assets", "scripts"] - ) + result = FileSkillsSource._validate_and_normalize_directory_names([".", "references", "assets", "scripts"]) assert result == [".", "references", "assets", "scripts"] def test_default_resource_directories(self) -> None: @@ -2163,6 +2159,145 @@ def test_description_exactly_max_length(self) -> None: assert result.description == desc +# --------------------------------------------------------------------------- +# Tests: _extract_frontmatter block scalar parsing +# 
--------------------------------------------------------------------------- + + +class TestExtractFrontmatterBlockScalars: + """Tests for YAML block scalar (| and >) parsing in _extract_frontmatter.""" + + def test_literal_block_scalar(self) -> None: + content = "---\nname: test-skill\ndescription: |\n Line one\n Line two\n---\nBody." + result = FileSkillsSource._extract_frontmatter(content, "test.md") + assert result is not None + assert result.description == "Line one\nLine two\n" + + def test_folded_block_scalar(self) -> None: + content = "---\nname: test-skill\ndescription: >\n This is a multi-line\n description block\n---\nBody." + result = FileSkillsSource._extract_frontmatter(content, "test.md") + assert result is not None + assert result.description == "This is a multi-line description block" + + def test_literal_strip_chomping(self) -> None: + content = "---\nname: test-skill\ndescription: |-\n No trailing newline\n---\nBody." + result = FileSkillsSource._extract_frontmatter(content, "test.md") + assert result is not None + assert result.description == "No trailing newline" + + def test_folded_strip_chomping(self) -> None: + content = "---\nname: test-skill\ndescription: >-\n Folded with\n strip chomping\n---\nBody." + result = FileSkillsSource._extract_frontmatter(content, "test.md") + assert result is not None + assert result.description == "Folded with strip chomping" + + def test_literal_keep_chomping(self) -> None: + content = "---\nname: test-skill\ndescription: |+\n Keep trailing\n---\nBody." + result = FileSkillsSource._extract_frontmatter(content, "test.md") + assert result is not None + assert result.description == "Keep trailing\n" + + def test_folded_keep_chomping(self) -> None: + content = "---\nname: test-skill\ndescription: >+\n Keep trailing\n newline\n---\nBody." 
+ result = FileSkillsSource._extract_frontmatter(content, "test.md") + assert result is not None + assert result.description == "Keep trailing newline\n" + + def test_block_scalar_no_continuation_lines(self) -> None: + content = "---\nname: test-skill\ndescription: |\nlicense: MIT\n---\nBody." + result = FileSkillsSource._extract_frontmatter(content, "test.md") + # description becomes empty string which fails validation (empty/whitespace) + assert result is None + + def test_block_scalar_varying_indentation(self) -> None: + content = ( + "---\n" + "name: test-skill\n" + "description: |\n" + " Line with 4-space indent\n" + " Line with 4-space indent\n" + "---\n" + "Body." + ) + result = FileSkillsSource._extract_frontmatter(content, "test.md") + assert result is not None + assert result.description == "Line with 4-space indent\nLine with 4-space indent\n" + + def test_folded_block_scalar_real_skill_format(self) -> None: + """End-to-end test matching the format used in .github/skills/ SKILL.md files.""" + content = ( + "---\n" + "name: python-development\n" + "description: >\n" + " Coding standards, conventions, and patterns for developing Python code in the\n" + " Agent Framework repository. Use this when writing or modifying Python source\n" + " files in the python/ directory.\n" + "---\n" + "\n" + "# Python Development Standards\n" + ) + result = FileSkillsSource._extract_frontmatter(content, "test.md") + assert result is not None + assert result.description == ( + "Coding standards, conventions, and patterns for developing Python code in the " + "Agent Framework repository. Use this when writing or modifying Python source " + "files in the python/ directory." + ) + + def test_block_scalar_with_other_fields_after(self) -> None: + content = "---\nname: test-skill\ndescription: >\n A folded\n description\nlicense: MIT\n---\nBody." 
+ result = FileSkillsSource._extract_frontmatter(content, "test.md") + assert result is not None + assert result.description == "A folded description" + assert result.license == "MIT" + + def test_plain_value_unchanged(self) -> None: + """Non-block-scalar values must not be affected by the block scalar logic.""" + content = "---\nname: test-skill\ndescription: A simple description.\n---\nBody." + result = FileSkillsSource._extract_frontmatter(content, "test.md") + assert result is not None + assert result.description == "A simple description." + + def test_block_scalar_content_with_colons(self) -> None: + """Lines inside a block scalar that look like YAML key-value pairs must be preserved verbatim.""" + content = ( + "---\nname: test-skill\ndescription: |\n Some text with colon: in it\n Another: line here\n---\nBody." + ) + result = FileSkillsSource._extract_frontmatter(content, "test.md") + assert result is not None + assert result.description == "Some text with colon: in it\nAnother: line here\n" + + def test_block_scalar_on_license_field(self) -> None: + """Block scalars should work on any field, not only description.""" + content = ( + "---\n" + "name: test-skill\n" + "description: A skill.\n" + "license: >\n" + " Custom license\n" + " spanning multiple lines\n" + "---\n" + "Body." + ) + result = FileSkillsSource._extract_frontmatter(content, "test.md") + assert result is not None + assert result.license == "Custom license spanning multiple lines" + + def test_block_scalar_tab_indentation(self) -> None: + """Tab characters should count as indentation for block scalar continuation lines.""" + content = "---\nname: test-skill\ndescription: |\n\tTab-indented line one\n\tTab-indented line two\n---\nBody." 
+ result = FileSkillsSource._extract_frontmatter(content, "test.md") + assert result is not None + assert result.description == "Tab-indented line one\nTab-indented line two\n" + + def test_block_scalar_blank_line_within_block(self) -> None: + """Blank lines within a block scalar should be preserved as paragraph separators.""" + content = "---\nname: test-skill\ndescription: |\n First paragraph\n\n Second paragraph\n---\nBody." + result = FileSkillsSource._extract_frontmatter(content, "test.md") + assert result is not None + assert result.description == "First paragraph\n\nSecond paragraph\n" + + # --------------------------------------------------------------------------- # Tests: Skill spec fields (via SkillFrontmatter) # --------------------------------------------------------------------------- @@ -5498,9 +5633,7 @@ def my_runner(skill: Any, script: Any, args: Any = None) -> str: return "ok" assert isinstance(my_runner, SkillScriptRunner) - skill = FileSkill( - frontmatter=SkillFrontmatter(name="s", description="d"), content="c", path=f"{_ABS}/test" - ) + skill = FileSkill(frontmatter=SkillFrontmatter(name="s", description="d"), content="c", path=f"{_ABS}/test") script = FileSkillScript(name="run.py", full_path=f"{_ABS}/test/run.py") result = my_runner(skill, script, args=["--flag", "value"]) assert result == "ok"