-
Notifications
You must be signed in to change notification settings - Fork 4.5k
Python: Fix HTML-like tags being dropped in ChatHistory.from_rendered_prompt #13659
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -345,16 +345,45 @@ def from_rendered_prompt(cls: type[_T], rendered_prompt: str) -> _T: | |||||
| except ParseError as exc: | ||||||
| logger.info(f"Could not parse prompt {prompt} as xml, treating as text, error was: {exc}") | ||||||
| return cls(messages=[ChatMessageContent(role=AuthorRole.USER, content=unescape(prompt))]) | ||||||
| if xml_prompt.text and xml_prompt.text.strip(): | ||||||
| messages.append(ChatMessageContent(role=AuthorRole.SYSTEM, content=unescape(xml_prompt.text.strip()))) | ||||||
| # Accumulate text content that should be combined into a single message. | ||||||
| # This handles HTML-like tags (e.g., <p>, <div>) that are valid XML but not | ||||||
| # recognized as chat message tags — their content should be preserved as text. | ||||||
| pending_text_parts: list[str] = [] | ||||||
| if xml_prompt.text: | ||||||
| pending_text_parts.append(xml_prompt.text) | ||||||
|
|
||||||
| def flush_pending_text(role: AuthorRole = AuthorRole.SYSTEM) -> None: | ||||||
| """Flush accumulated text as a chat message if non-empty.""" | ||||||
| if pending_text_parts: | ||||||
| combined = "".join(pending_text_parts).strip() | ||||||
| if combined: | ||||||
| messages.append(ChatMessageContent(role=role, content=unescape(combined))) | ||||||
| pending_text_parts.clear() | ||||||
|
|
||||||
| for item in xml_prompt: | ||||||
| if item.tag == CHAT_MESSAGE_CONTENT_TAG: | ||||||
| # Flush any pending text before a structured message | ||||||
| flush_pending_text() | ||||||
| messages.append(ChatMessageContent.from_element(item)) | ||||||
| # Tail text after a recognized message element is treated as USER content | ||||||
| if item.tail and item.tail.strip(): | ||||||
| messages.append(ChatMessageContent(role=AuthorRole.USER, content=unescape(item.tail.strip()))) | ||||||
| elif item.tag == CHAT_HISTORY_TAG: | ||||||
| flush_pending_text() | ||||||
|
||||||
| flush_pending_text() | |
| flush_pending_text(role=AuthorRole.USER if messages else AuthorRole.SYSTEM) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -592,6 +592,48 @@ async def test_template_empty_history(chat_history: ChatHistory): | |
| assert chat_history_2.messages[1].role == AuthorRole.USER | ||
|
|
||
|
|
||
| def test_chat_history_from_rendered_prompt_with_html_tags(): | ||
| """Regression test for #13632: HTML tags in prompts caused content to be dropped. | ||
|
|
||
| When the prompt contains valid XML tags like <p>, <div>, etc., the XML parser | ||
| was treating them as elements and silently discarding their content because | ||
| they weren't recognized as chat message tags. The fix serializes unrecognized | ||
| elements back to their string representation. | ||
| """ | ||
| # Prompt with HTML-like tags | ||
| prompt_with_html = 'Translate this: "<p>What is your name?</p>"' | ||
| # Same prompt without HTML tags | ||
| prompt_without_html = 'Translate this: "What is your name?"' | ||
|
|
||
| history_with_html = ChatHistory.from_rendered_prompt(prompt_with_html) | ||
| history_without_html = ChatHistory.from_rendered_prompt(prompt_without_html) | ||
|
|
||
| # Both should produce a single user message | ||
| assert len(history_with_html.messages) == 1 | ||
| assert len(history_without_html.messages) == 1 | ||
|
|
||
| # Both should contain the question text | ||
| assert "What is your name?" in history_with_html.messages[0].content | ||
| assert "What is your name?" in history_without_html.messages[0].content | ||
|
|
||
| # The HTML version should preserve the <p> tags | ||
| assert "<p>" in history_with_html.messages[0].content | ||
| assert "</p>" in history_with_html.messages[0].content | ||
|
Comment on lines
+603
to
+621
|
||
|
|
||
|
|
||
| def test_chat_history_from_rendered_prompt_with_nested_html(): | ||
| """Test that nested HTML-like tags are preserved.""" | ||
| prompt = "Format this: <div><p>Hello</p><p>World</p></div>" | ||
|
|
||
| history = ChatHistory.from_rendered_prompt(prompt) | ||
|
|
||
| assert len(history.messages) == 1 | ||
| assert "Hello" in history.messages[0].content | ||
| assert "World" in history.messages[0].content | ||
| assert "<div>" in history.messages[0].content | ||
| assert "<p>" in history.messages[0].content | ||
|
|
||
|
|
||
| def test_to_from_file(chat_history: ChatHistory, tmp_path): | ||
| chat_history.add_system_message("You are an AI assistant") | ||
| chat_history.add_user_message("What is the weather in Seattle?") | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
flush_pending_text() defaults to SYSTEM, so when an unrecognized element (for example `<p>`) appears between already-parsed chat messages and a subsequent chat message element, the preserved HTML gets emitted as a SYSTEM message. That is inconsistent with how inter-message text (item.tail) is treated as USER, and can unintentionally escalate user-provided content into the system role. Consider flushing pending text with role=USER when messages already exist (and only SYSTEM when no messages have been emitted yet).