From e8f1af7c168932e935496b6874023c1ae4ba6a73 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Fri, 24 Apr 2026 18:51:24 +0200 Subject: [PATCH] fix: preserve .tail text when removing empty elements (#1938) remove_empty_elements_fast() was dropping trailing text attached to elements via lxml .tail when removing empty elements. Now appends the tail to the previous sibling's .tail, or to the parent's .text when there is no previous sibling, before removal. --- crawl4ai/content_scraping_strategy.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 9853f788f..9cdc53cfd 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -562,6 +562,14 @@ def remove_empty_elements_fast(self, root, word_count_threshold=5): ): parent = el.getparent() if parent is not None: + # Preserve .tail text before removing the element + tail = el.tail + if tail: + prev = el.getprevious() + if prev is not None: + prev.tail = (prev.tail or "") + tail + else: + parent.text = (parent.text or "") + tail parent.remove(el) return root