Skip to content

Commit 1404a4d

Browse files
miss-islingtonrobsdedudeblurb-it[bot]bitdancer
authored
[3.14] gh-144156: Fix email header folding concatenating encoded words (GH-144692) (#145009)
gh-144156: Fix email header folding concatenating encoded words (GH-144692) The fix for gh-92081 (gh-92281) was flawed: it broke whitespace handling for encoded word patterns that had previously been working correctly but had no corresponding tests. The resulting headers were not RFC compliant, to the point that Yahoo started rejecting the resulting emails. That flawed fix was released in 3.14 alpha 1, 3.13 beta 2 and 3.12.5. This PR fixes the original problem in a way that does not break anything, and in fact also fixes a small pre-existing bug (a spurious whitespace after the ':' of the header label if the header value is immediately wrapped on to the next line). (RDM) (cherry picked from commit 0f7cd55) Co-authored-by: Robsdedude <dev@rouvenbauer.de> Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com> Co-authored-by: R. David Murray <rdmurray@bitdance.com>
1 parent 24b5309 commit 1404a4d

File tree

5 files changed

+85
-39
lines changed

5 files changed

+85
-39
lines changed

Lib/email/_header_value_parser.py

Lines changed: 37 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,8 @@
8080
# Useful constants and functions
8181
#
8282

83-
WSP = set(' \t')
83+
_WSP = ' \t'
84+
WSP = set(_WSP)
8485
CFWS_LEADER = WSP | set('(')
8586
SPECIALS = set(r'()<>@,:;.\"[]')
8687
ATOM_ENDS = SPECIALS | WSP
@@ -2831,6 +2832,7 @@ def _steal_trailing_WSP_if_exists(lines):
28312832
lines.pop()
28322833
return wsp
28332834

2835+
28342836
def _refold_parse_tree(parse_tree, *, policy):
28352837
"""Return string of contents of parse_tree folded according to RFC rules.
28362838
@@ -2839,11 +2841,9 @@ def _refold_parse_tree(parse_tree, *, policy):
28392841
maxlen = policy.max_line_length or sys.maxsize
28402842
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
28412843
lines = [''] # Folded lines to be output
2842-
leading_whitespace = '' # When we have whitespace between two encoded
2843-
# words, we may need to encode the whitespace
2844-
# at the beginning of the second word.
2845-
last_ew = None # Points to the last encoded character if there's an ew on
2846-
# the line
2844+
last_word_is_ew = False
2845+
last_ew = None # if there is an encoded word in the last line of lines,
2846+
# points to the encoded word's first character
28472847
last_charset = None
28482848
wrap_as_ew_blocked = 0
28492849
want_encoding = False # This is set to True if we need to encode this part
@@ -2878,6 +2878,7 @@ def _refold_parse_tree(parse_tree, *, policy):
28782878
if part.token_type == 'mime-parameters':
28792879
# Mime parameter folding (using RFC2231) is extra special.
28802880
_fold_mime_parameters(part, lines, maxlen, encoding)
2881+
last_word_is_ew = False
28812882
continue
28822883

28832884
if want_encoding and not wrap_as_ew_blocked:
@@ -2894,6 +2895,7 @@ def _refold_parse_tree(parse_tree, *, policy):
28942895
# XXX what if encoded_part has no leading FWS?
28952896
lines.append(newline)
28962897
lines[-1] += encoded_part
2898+
last_word_is_ew = False
28972899
continue
28982900
# Either this is not a major syntactic break, so we don't
28992901
# want it on a line by itself even if it fits, or it
@@ -2912,11 +2914,16 @@ def _refold_parse_tree(parse_tree, *, policy):
29122914
(last_charset == 'unknown-8bit' or
29132915
last_charset == 'utf-8' and charset != 'us-ascii')):
29142916
last_ew = None
2915-
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
2916-
part.ew_combine_allowed, charset, leading_whitespace)
2917-
# This whitespace has been added to the lines in _fold_as_ew()
2918-
# so clear it now.
2919-
leading_whitespace = ''
2917+
last_ew = _fold_as_ew(
2918+
tstr,
2919+
lines,
2920+
maxlen,
2921+
last_ew,
2922+
part.ew_combine_allowed,
2923+
charset,
2924+
last_word_is_ew,
2925+
)
2926+
last_word_is_ew = True
29202927
last_charset = charset
29212928
want_encoding = False
29222929
continue
@@ -2929,28 +2936,19 @@ def _refold_parse_tree(parse_tree, *, policy):
29292936

29302937
if len(tstr) <= maxlen - len(lines[-1]):
29312938
lines[-1] += tstr
2939+
last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP))
29322940
continue
29332941

29342942
# This part is too long to fit. The RFC wants us to break at
29352943
# "major syntactic breaks", so unless we don't consider this
29362944
# to be one, check if it will fit on the next line by itself.
2937-
leading_whitespace = ''
29382945
if (part.syntactic_break and
29392946
len(tstr) + 1 <= maxlen):
29402947
newline = _steal_trailing_WSP_if_exists(lines)
29412948
if newline or part.startswith_fws():
2942-
# We're going to fold the data onto a new line here. Due to
2943-
# the way encoded strings handle continuation lines, we need to
2944-
# be prepared to encode any whitespace if the next line turns
2945-
# out to start with an encoded word.
29462949
lines.append(newline + tstr)
2947-
2948-
whitespace_accumulator = []
2949-
for char in lines[-1]:
2950-
if char not in WSP:
2951-
break
2952-
whitespace_accumulator.append(char)
2953-
leading_whitespace = ''.join(whitespace_accumulator)
2950+
last_word_is_ew = (last_word_is_ew
2951+
and not bool(lines[-1].strip(_WSP)))
29542952
last_ew = None
29552953
continue
29562954
if not hasattr(part, 'encode'):
@@ -2990,10 +2988,11 @@ def _refold_parse_tree(parse_tree, *, policy):
29902988
else:
29912989
# We can't fold it onto the next line either...
29922990
lines[-1] += tstr
2991+
last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP))
29932992

29942993
return policy.linesep.join(lines) + policy.linesep
29952994

2996-
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace):
2995+
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew):
29972996
"""Fold string to_encode into lines as encoded word, combining if allowed.
29982997
Return the new value for last_ew, or None if ew_combine_allowed is False.
29992998
@@ -3008,6 +3007,16 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
30083007
to_encode = str(
30093008
get_unstructured(lines[-1][last_ew:] + to_encode))
30103009
lines[-1] = lines[-1][:last_ew]
3010+
elif last_word_is_ew:
3011+
# If we are following up an encoded word with another encoded word,
3012+
# any white space between the two will be ignored when decoded.
3013+
# Therefore, we encode all to-be-displayed whitespace in the second
3014+
# encoded word.
3015+
len_without_wsp = len(lines[-1].rstrip(_WSP))
3016+
leading_whitespace = lines[-1][len_without_wsp:]
3017+
lines[-1] = (lines[-1][:len_without_wsp]
3018+
+ (' ' if leading_whitespace else ''))
3019+
to_encode = leading_whitespace + to_encode
30113020
elif to_encode[0] in WSP:
30123021
# We're joining this to non-encoded text, so don't encode
30133022
# the leading blank.
@@ -3036,20 +3045,13 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
30363045

30373046
while to_encode:
30383047
remaining_space = maxlen - len(lines[-1])
3039-
text_space = remaining_space - chrome_len - len(leading_whitespace)
3048+
text_space = remaining_space - chrome_len
30403049
if text_space <= 0:
3041-
lines.append(' ')
3050+
newline = _steal_trailing_WSP_if_exists(lines)
3051+
lines.append(newline or ' ')
3052+
new_last_ew = len(lines[-1])
30423053
continue
30433054

3044-
# If we are at the start of a continuation line, prepend whitespace
3045-
# (we only want to do this when the line starts with an encoded word
3046-
# but if we're folding in this helper function, then we know that we
3047-
# are going to be writing out an encoded word.)
3048-
if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace:
3049-
encoded_word = _ew.encode(leading_whitespace, charset=encode_as)
3050-
lines[-1] += encoded_word
3051-
leading_whitespace = ''
3052-
30533055
to_encode_word = to_encode[:text_space]
30543056
encoded_word = _ew.encode(to_encode_word, charset=encode_as)
30553057
excess = len(encoded_word) - remaining_space
@@ -3061,7 +3063,6 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
30613063
excess = len(encoded_word) - remaining_space
30623064
lines[-1] += encoded_word
30633065
to_encode = to_encode[len(to_encode_word):]
3064-
leading_whitespace = ''
30653066

30663067
if to_encode:
30673068
lines.append(' ')

Lib/test/test_email/test_generator.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,50 @@ def test_defaults_handle_spaces_at_start_of_continuation_line(self):
393393
g.flatten(msg)
394394
self.assertEqual(s.getvalue(), expected)
395395

396+
# gh-144156: a fold between non-encoded and encoded words doesn't need to encode
397+
# the separating space
398+
def test_defaults_handle_spaces_at_start_of_continuation_line_2(self):
399+
source = ("Re: [SOS-1495488] Commande et livraison - Demande de retour - "
400+
"bibijolie - 251210-AABBCC - Abo actualités digitales 20 semaines "
401+
"d’abonnement à 24 heures, Bilan, Tribune de Genève et tous les titres Tamedia")
402+
expected = (
403+
b"Subject: "
404+
b"Re: [SOS-1495488] Commande et livraison - Demande de retour -\n"
405+
b" bibijolie - 251210-AABBCC - Abo =?utf-8?q?actualit=C3=A9s?= digitales 20\n"
406+
b" semaines =?utf-8?q?d=E2=80=99abonnement_=C3=A0?= 24 heures, Bilan, Tribune de\n"
407+
b" =?utf-8?q?Gen=C3=A8ve?= et tous les titres Tamedia\n\n"
408+
)
409+
msg = EmailMessage()
410+
msg['Subject'] = source
411+
s = io.BytesIO()
412+
g = BytesGenerator(s)
413+
g.flatten(msg)
414+
self.assertEqual(s.getvalue(), expected)
415+
416+
def test_ew_folding_round_trip_1(self):
417+
print()
418+
source = "aaaaaaaaa фффффффф "
419+
msg = EmailMessage()
420+
msg['Subject'] = source
421+
s = io.BytesIO()
422+
g = BytesGenerator(s, maxheaderlen=30)
423+
g.flatten(msg)
424+
flat = s.getvalue()
425+
reparsed = message_from_bytes(flat, policy=policy.default)['Subject']
426+
self.assertMultiLineEqual(reparsed, source)
427+
428+
def test_ew_folding_round_trip_2(self):
429+
print()
430+
source = "aaa aaaaaaa aaa ффф фффф "
431+
msg = EmailMessage()
432+
msg['Subject'] = source
433+
s = io.BytesIO()
434+
g = BytesGenerator(s, maxheaderlen=30)
435+
g.flatten(msg)
436+
flat = s.getvalue()
437+
reparsed = message_from_bytes(flat, policy=policy.default)['Subject']
438+
self.assertMultiLineEqual(reparsed, source)
439+
396440
def test_cte_type_7bit_handles_unknown_8bit(self):
397441
source = ("Subject: Maintenant je vous présente mon "
398442
"collègue\n\n").encode('utf-8')

Lib/test/test_email/test_headerregistry.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1702,7 +1702,7 @@ def test_fold_unstructured_with_overlong_word(self):
17021702
'singlewordthatwontfit')
17031703
self.assertEqual(
17041704
h.fold(policy=policy.default.clone(max_line_length=20)),
1705-
'Subject: \n'
1705+
'Subject:\n'
17061706
' =?utf-8?q?thisisa?=\n'
17071707
' =?utf-8?q?verylon?=\n'
17081708
' =?utf-8?q?glineco?=\n'
@@ -1718,7 +1718,7 @@ def test_fold_unstructured_with_two_overlong_words(self):
17181718
'singlewordthatwontfit plusanotherverylongwordthatwontfit')
17191719
self.assertEqual(
17201720
h.fold(policy=policy.default.clone(max_line_length=20)),
1721-
'Subject: \n'
1721+
'Subject:\n'
17221722
' =?utf-8?q?thisisa?=\n'
17231723
' =?utf-8?q?verylon?=\n'
17241724
' =?utf-8?q?glineco?=\n'

Lib/test/test_email/test_policy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ def test_non_ascii_chars_do_not_cause_inf_loop(self):
273273
actual = policy.fold('Subject', 'ą' * 12)
274274
self.assertEqual(
275275
actual,
276-
'Subject: \n' +
276+
'Subject:\n' +
277277
12 * ' =?utf-8?q?=C4=85?=\n')
278278

279279
def test_short_maxlen_error(self):
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix the folding of headers by the :mod:`email` library when :rfc:`2047` encoded words are used. Now whitespace is correctly preserved and also correctly added between adjacent encoded words. The latter property was broken by the fix for gh-92081, which mostly fixed previous failures to preserve whitespace.

0 commit comments

Comments
 (0)