Skip to content

Commit af3c912

Browse files
miss-islington, serhiy-storchaka, and claude
authored
[3.14] gh-152415: Exercise curses non-ASCII tests under 8-bit locale encodings (GH-152416) (GH-152453) (GH-152456)
The non-ASCII tests only exercised what the runner's locale could encode (in practice UTF-8). Add 8-bit-encoding cases to the character and string I/O tests, each guarded by the existing encodability check: ASCII, a character common to the Latin encodings ('é'), and ones distinctive to a single encoding (byte 0xA4 is '¤' in ISO-8859-1, '€' in ISO-8859-15, 'є' in KOI8-U). Run the whole suite under different locales to cover them; unrepresentable cases skip.

* gh-152415: Verify character output round-trips in test_output_character

Read each written character back with in_wch() or instr() rather than inch(), which on a wide build returns the low byte of the code point instead of the locale-encoded byte and so mangles a non-ASCII character of an 8-bit locale. This lets the int-argument cases cover '€'/'є', and adds matching coverage for the str argument.

insch() with an int byte > 127 is checked only for Latin-1: on a wide build ncurses winsch stores a printable byte directly as a code point instead of decoding it through the locale.

(cherry picked from commit 003d362)
(cherry picked from commit a75aa41)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 933d3ce commit af3c912

1 file changed

Lines changed: 242 additions & 27 deletions

File tree

Lib/test/test_curses.py

Lines changed: 242 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -249,6 +249,33 @@ def test_refresh_control(self):
249249
self.assertIs(win.is_wintouched(), syncok)
250250
self.assertIs(stdscr.is_wintouched(), syncok)
251251

252+
# Many tests below use a common set of non-ASCII cases, each applied only
253+
# when the window encoding can represent it -- so the whole suite is meant to
254+
# be run under several locales (e.g. ISO-8859-1, ISO-8859-15, KOI8-U):
255+
# 'A'/'a' ASCII
256+
# 'é' common to the Latin encodings
257+
# '¤'/'€'/'є' byte 0xA4 in ISO-8859-1 / ISO-8859-15 / KOI8-U
258+
# Precomposed characters are used so a round-trip does not depend on the
# Unicode normalization form.
259+
260+
def _encodable(self, s):
261+
# Wide characters are only supported in a locale that can encode them.
262+
try:
263+
s.encode(self.stdscr.encoding)
264+
except UnicodeEncodeError:
265+
return False
266+
return True
267+
268+
def _read_char(self, y, x):
269+
# The character written to a cell, read back for output checks. inch()
270+
# is unusable here: on a wide build it returns the low 8 bits of the
271+
# character's code point rather than its locale-encoded byte, mangling
272+
# anything outside Latin-1. in_wch() reads the wide cell directly;
273+
# without it, instr() re-encodes the cell to the window encoding.
274+
stdscr = self.stdscr
275+
if hasattr(stdscr, 'in_wch'):
276+
return str(stdscr.in_wch(y, x))
277+
return stdscr.instr(y, x, 1).decode(stdscr.encoding)
278+
252279
def test_output_character(self):
253280
stdscr = self.stdscr
254281
encoding = stdscr.encoding
@@ -258,32 +285,98 @@ def test_output_character(self):
258285
stdscr.addch('A')
259286
stdscr.addch(b'A')
260287
stdscr.addch(65)
261-
c = '\u20ac'
262-
try:
263-
stdscr.addch(c)
264-
except UnicodeEncodeError:
265-
self.assertRaises(UnicodeEncodeError, c.encode, encoding)
266-
except OverflowError:
267-
encoded = c.encode(encoding)
268-
self.assertNotEqual(len(encoded), 1, repr(encoded))
288+
# See _encodable for the character set. Each is either written (mapped
289+
# to a single byte), or raises UnicodeEncodeError (not in the encoding)
290+
# or OverflowError (a multibyte sequence, e.g. in UTF-8).
291+
for c in ('A', '\u00e9', '\u00a4', '\u20ac', '\u0454'):
292+
try:
293+
stdscr.addch(c)
294+
except UnicodeEncodeError:
295+
self.assertRaises(UnicodeEncodeError, c.encode, encoding)
296+
except OverflowError:
297+
encoded = c.encode(encoding)
298+
self.assertNotEqual(len(encoded), 1, repr(encoded))
269299
stdscr.addch('A', curses.A_BOLD)
270300
stdscr.addch(1, 2, 'A')
271301
stdscr.addch(2, 3, 'A', curses.A_BOLD)
272302
self.assertIs(stdscr.is_wintouched(), True)
273303

304+
# The same characters supplied as an int chtype (a byte > 127). The
305+
# cell is read back with _read_char(), not inch(): on a wide build the
306+
# int is stored through the locale as a wide character that inch()
307+
# cannot represent for a character outside Latin-1.
308+
for c in ('é', '¤', '€', 'є'):
309+
try:
310+
b = c.encode(encoding)
311+
except UnicodeEncodeError:
312+
continue
313+
if len(b) != 1:
314+
continue
315+
# A wide build stores a character outside Latin-1 as a wide cell,
316+
# not as its encoded byte, so it cannot round-trip here.
317+
if ord(c) > 0xff and hasattr(stdscr, 'get_wch'):
318+
continue
319+
v = b[0]
320+
with self.subTest(c=c):
321+
stdscr.addch(0, 0, v)
322+
self.assertEqual(self._read_char(0, 0), c)
323+
stdscr.addch(0, 1, v, curses.A_BOLD)
324+
self.assertEqual(self._read_char(0, 1), c)
325+
self.assertTrue(stdscr.inch(0, 1) & curses.A_BOLD)
326+
stdscr.move(2, 0)
327+
stdscr.echochar(v)
328+
self.assertEqual(self._read_char(2, 0), c)
329+
# insch() round-trips a byte only where its code point equals
330+
# the byte value (Latin-1): on a wide build ncurses winsch
331+
# stores a printable byte directly as a code point instead of
332+
# decoding it through the locale.
333+
if ord(c) < 0x100:
334+
stdscr.insch(1, 0, v)
335+
self.assertEqual(self._read_char(1, 0), c)
336+
337+
# The same characters supplied as a str. Unlike the int path above, a
338+
# str is stored as a wide-character cell on a wide build, so insch()
339+
# round-trips too. The guards below skip a character outside Latin-1
340+
# on a wide build and a multibyte character on a narrow build.
341+
wide = hasattr(stdscr, 'in_wch')
342+
for c in ('é', '¤', '€', 'є'):
343+
if not self._encodable(c):
344+
continue
345+
if not wide and len(c.encode(encoding)) != 1:
346+
continue
347+
# A wide build stores a character outside Latin-1 as a wide cell,
348+
# not as its encoded byte, so it cannot round-trip here.
349+
if ord(c) > 0xff and hasattr(stdscr, 'get_wch'):
350+
continue
351+
with self.subTest(c=c):
352+
stdscr.addch(0, 0, c)
353+
self.assertEqual(self._read_char(0, 0), c)
354+
stdscr.addch(0, 1, c, curses.A_BOLD)
355+
self.assertEqual(self._read_char(0, 1), c)
356+
self.assertTrue(stdscr.inch(0, 1) & curses.A_BOLD)
357+
stdscr.insch(1, 0, c)
358+
self.assertEqual(self._read_char(1, 0), c)
359+
stdscr.move(2, 0)
360+
stdscr.echochar(c)
361+
self.assertEqual(self._read_char(2, 0), c)
362+
274363
# echochar()
275364
stdscr.refresh()
276365
stdscr.move(0, 0)
277366
stdscr.echochar('A')
278367
stdscr.echochar(b'A')
279368
stdscr.echochar(65)
280-
with self.assertRaises((UnicodeEncodeError, OverflowError)):
281-
# Unicode is not fully supported yet, but at least it does
282-
# not crash.
283-
# It is supposed to fail because either the character is
284-
# not encodable with the current encoding, or it is encoded to
285-
# a multibyte sequence.
286-
stdscr.echochar('\u0114')
369+
# See _encodable for the character set; as in the addch() loop above.
370+
for c in ('A', '\u00e9', '\u00a4', '\u20ac', '\u0454'):
371+
try:
372+
stdscr.echochar(c)
373+
except UnicodeEncodeError:
374+
# The character is not encodable with the current encoding.
375+
self.assertRaises(UnicodeEncodeError, c.encode, encoding)
376+
except OverflowError:
377+
# The character is encoded to a multibyte sequence.
378+
encoded = c.encode(encoding)
379+
self.assertNotEqual(len(encoded), 1, repr(encoded))
287380
stdscr.echochar('A', curses.A_BOLD)
288381
self.assertIs(stdscr.is_wintouched(), False)
289382

@@ -293,14 +386,18 @@ def test_output_string(self):
293386
# addstr()/insstr()
294387
for func in [stdscr.addstr, stdscr.insstr]:
295388
with self.subTest(func.__qualname__):
296-
stdscr.move(0, 0)
297389
func('abcd')
298390
func(b'abcd')
299-
s = 'àßçđ'
300-
try:
301-
func(s)
302-
except UnicodeEncodeError:
303-
self.assertRaises(UnicodeEncodeError, s.encode, encoding)
391+
# Common and encoding-distinctive strings (see _encodable for the
392+
# 0xA4 set); 'àßçđ' is UTF-8-only. Each is written if the
393+
# encoding allows, else raises UnicodeEncodeError.
394+
for s in ('soupçon', 'àßçđ', 'soupçon ¤', 'soupçon €', 'дякую'):
395+
stdscr.move(0, 0)
396+
try:
397+
func(s)
398+
except UnicodeEncodeError:
399+
self.assertRaises(UnicodeEncodeError, s.encode, encoding)
400+
stdscr.move(0, 0)
304401
func('abcd', curses.A_BOLD)
305402
func(1, 2, 'abcd')
306403
func(2, 3, 'abcd', curses.A_BOLD)
@@ -311,11 +408,14 @@ def test_output_string(self):
311408
stdscr.move(0, 0)
312409
func('1234', 3)
313410
func(b'1234', 3)
314-
s = '\u0661\u0662\u0663\u0664'
315-
try:
316-
func(s, 3)
317-
except UnicodeEncodeError:
318-
self.assertRaises(UnicodeEncodeError, s.encode, encoding)
411+
# As above (see _encodable); Arabic-Indic digits are UTF-8-only.
412+
for s in ('caf\u00e9', '\u0661\u0662\u0663\u0664', 'caf\u00e9 \u00a4', 'caf\u00e9 \u20ac', '\u0434\u044f\u043a\u0443\u044e'):
413+
stdscr.move(0, 0)
414+
try:
415+
func(s, 3)
416+
except UnicodeEncodeError:
417+
self.assertRaises(UnicodeEncodeError, s.encode, encoding)
418+
stdscr.move(0, 0)
319419
func('1234', 5)
320420
func('1234', 3, curses.A_BOLD)
321421
func(1, 2, '1234', 3)
@@ -405,6 +505,24 @@ def test_read_from_window(self):
405505
self.assertEqual(stdscr.instr(0, 2, 4), b'BCD ')
406506
self.assertRaises(ValueError, stdscr.instr, -2)
407507
self.assertRaises(ValueError, stdscr.instr, 0, 2, -2)
508+
# A non-ASCII character of an 8-bit locale reads back as its encoded
509+
# byte (see _encodable for the set). instr() returns the locale bytes
510+
# for any single-byte character; inch() packs the text into a chtype, so
511+
# on a wide build it only round-trips a Latin-1 codepoint (byte ==
512+
# codepoint).
513+
encoding = stdscr.encoding
514+
for ch in ('A', 'é', '¤', '€', 'є'):
515+
try:
516+
b = ch.encode(encoding)
517+
except UnicodeEncodeError:
518+
continue
519+
if len(b) != 1:
520+
continue
521+
with self.subTest(ch=ch):
522+
stdscr.addstr(2, 0, ch)
523+
self.assertEqual(stdscr.instr(2, 0, 1), b)
524+
if ord(ch) < 0x100:
525+
self.assertEqual(stdscr.inch(2, 0) & curses.A_CHARTEXT, b[0])
408526

409527
def test_coordinate_errors(self):
410528
# Addressing a cell outside the window raises curses.error.
@@ -441,6 +559,10 @@ def test_getch(self):
441559
self.assertEqual(win.getch(), b'm'[0])
442560
self.assertEqual(win.getch(), b'\n'[0])
443561

562+
# A key value > 127 is delivered unchanged (it is not locale text).
563+
curses.ungetch(0xE9)
564+
self.assertEqual(win.getch(), 0xE9)
565+
444566
def test_getstr(self):
445567
win = curses.newwin(5, 12, 5, 2)
446568
curses.echo()
@@ -613,6 +735,33 @@ def test_background(self):
613735
self.assertEqual(win.inch(0, 0), b'L'[0] | curses.A_REVERSE)
614736
self.assertEqual(win.inch(0, 5), b'#'[0] | curses.A_REVERSE)
615737

738+
# A non-ASCII background character of an 8-bit locale reads back as its
739+
# encoded byte. See _encodable for the character set.
740+
win.bkgd(' ')
741+
encoding = win.encoding
742+
for ch in ('é', '¤', '€', 'є'):
743+
try:
744+
b = ch.encode(encoding)
745+
except UnicodeEncodeError:
746+
continue
747+
if len(b) != 1:
748+
continue
749+
# A wide build stores a character outside Latin-1 as a wide cell,
750+
# not as its encoded byte, so it cannot round-trip here.
751+
if ord(ch) > 0xff and hasattr(win, 'get_wch'):
752+
continue
753+
with self.subTest(ch=ch):
754+
win.bkgd(ch)
755+
self.assertEqual(win.getbkgd(), b[0])
756+
if ord(ch) < 0x100:
757+
# The same byte given as an int. A wide build stores it
758+
# through the locale, so only a Latin-1 byte round-trips.
759+
win.bkgd(' ')
760+
win.bkgdset(b[0])
761+
self.assertEqual(win.getbkgd(), b[0])
762+
win.bkgd(b[0])
763+
self.assertEqual(win.getbkgd(), b[0])
764+
616765
def test_overlay(self):
617766
srcwin = curses.newwin(5, 18, 3, 4)
618767
lorem_ipsum(srcwin)
@@ -705,6 +854,16 @@ def test_borders_and_lines(self):
705854
win.border(65, 66)
706855
win.border(65)
707856
win.border()
857+
# With no arguments, border() fills the edges with ACS line and corner
858+
# characters.
859+
chartext = curses.A_CHARTEXT
860+
maxy, maxx = win.getmaxyx()
861+
self.assertEqual(win.inch(0, 0) & chartext, curses.ACS_ULCORNER & chartext)
862+
self.assertEqual(win.inch(0, maxx-1) & chartext, curses.ACS_URCORNER & chartext)
863+
self.assertEqual(win.inch(maxy-1, 0) & chartext, curses.ACS_LLCORNER & chartext)
864+
self.assertEqual(win.inch(maxy-1, maxx-1) & chartext, curses.ACS_LRCORNER & chartext)
865+
self.assertEqual(win.inch(0, 1) & chartext, curses.ACS_HLINE & chartext)
866+
self.assertEqual(win.inch(1, 0) & chartext, curses.ACS_VLINE & chartext)
708867

709868
win.box(':', '~')
710869
self.assertEqual(win.instr(0, 1, 8), b'~~~~~~~~')
@@ -715,6 +874,11 @@ def test_borders_and_lines(self):
715874
self.assertRaises(TypeError, win.box, 65, 66, 67)
716875
self.assertRaises(TypeError, win.box, 65)
717876
win.box()
877+
# With no arguments, box() likewise draws ACS corners and lines.
878+
self.assertEqual(win.inch(0, 0) & chartext, curses.ACS_ULCORNER & chartext)
879+
self.assertEqual(win.inch(0, maxx-1) & chartext, curses.ACS_URCORNER & chartext)
880+
self.assertEqual(win.inch(0, 1) & chartext, curses.ACS_HLINE & chartext)
881+
self.assertEqual(win.inch(1, 0) & chartext, curses.ACS_VLINE & chartext)
718882

719883
win.move(1, 2)
720884
win.hline('-', 5)
@@ -736,6 +900,43 @@ def test_borders_and_lines(self):
736900
self.assertEqual(win.inch(2, 1), b';'[0] | curses.A_STANDOUT)
737901
self.assertEqual(win.inch(3, 1), b'a'[0])
738902

903+
# A border or line character of an 8-bit locale round-trips as its
904+
# encoded byte. See _encodable for the character set.
905+
encoding = win.encoding
906+
for ch in ('é', '¤', '€', 'є'):
907+
try:
908+
b = ch.encode(encoding)
909+
except UnicodeEncodeError:
910+
continue
911+
if len(b) != 1:
912+
continue
913+
# A wide build stores a character outside Latin-1 as a wide cell,
914+
# not as its encoded byte, so it cannot round-trip here.
915+
if ord(ch) > 0xff and hasattr(win, 'get_wch'):
916+
continue
917+
with self.subTest(ch=ch):
918+
win.erase()
919+
win.hline(2, 0, ch, 5)
920+
self.assertEqual(win.instr(2, 0, 5), b * 5)
921+
win.vline(0, 0, ch, 3)
922+
self.assertEqual(win.instr(0, 0, 1), b)
923+
self.assertEqual(win.instr(1, 0, 1), b)
924+
win.border(ch, ch, ch, ch, ch, ch, ch, ch)
925+
self.assertEqual(win.instr(0, 0), b * maxx)
926+
if ord(ch) < 0x100:
927+
# The same byte given as an int. A wide build stores it
928+
# through the locale, so only a Latin-1 byte round-trips.
929+
v = b[0]
930+
win.erase()
931+
win.hline(2, 0, v, 5)
932+
self.assertEqual(win.instr(2, 0, 5), b * 5)
933+
win.vline(0, 0, v, 3)
934+
self.assertEqual(win.instr(1, 0, 1), b)
935+
win.border(v, v, v, v, v, v, v, v)
936+
self.assertEqual(win.instr(0, 0), b * maxx)
937+
win.box(v, v)
938+
self.assertEqual(win.instr(0, 1, 1), b)
939+
739940
def test_unctrl(self):
740941
# TODO: wunctrl()
741942
self.assertEqual(curses.unctrl(b'A'), b'A')
@@ -744,6 +945,19 @@ def test_unctrl(self):
744945
self.assertEqual(curses.unctrl(b'\n'), b'^J')
745946
self.assertEqual(curses.unctrl('\n'), b'^J')
746947
self.assertEqual(curses.unctrl(10), b'^J')
948+
# A printable non-ASCII byte of an 8-bit locale is returned unchanged.
949+
# See _encodable for the character set.
950+
encoding = self.stdscr.encoding
951+
for ch in ('é', '¤', '€', 'є'):
952+
try:
953+
b = ch.encode(encoding)
954+
except UnicodeEncodeError:
955+
continue
956+
if len(b) != 1:
957+
continue
958+
with self.subTest(ch=ch):
959+
self.assertEqual(curses.unctrl(ch), b)
960+
self.assertEqual(curses.unctrl(b[0]), b) # the byte as an int
747961
self.assertRaises(TypeError, curses.unctrl, b'')
748962
self.assertRaises(TypeError, curses.unctrl, b'AB')
749963
self.assertRaises(TypeError, curses.unctrl, '')
@@ -1455,7 +1669,8 @@ def test_issue6243(self):
14551669
def test_unget_wch(self):
14561670
stdscr = self.stdscr
14571671
encoding = stdscr.encoding
1458-
for ch in ('a', '\xe9', '\u20ac', '\U0010FFFF'):
1672+
# See _encodable for the character set, plus a non-BMP character.
1673+
for ch in ('a', '\xe9', '\xa4', '\u20ac', '\u0454', '\U0010FFFF'):
14591674
try:
14601675
ch.encode(encoding)
14611676
except UnicodeEncodeError:

0 commit comments

Comments
 (0)