gh-130273: Fix traceback color output with unicode characters #142529
gh-130273: Fix traceback color output with unicode characters #142529 — grayjk wants to merge 5 commits into python:main from
Conversation
|
@serhiy-storchaka: Here is a PR about text width and Unicode characters :-) |
|
updated to use @serhiy-storchaka's recently added unicodedata.iter_graphemes |
|
@pablogsal @hauntsaninja as recent reviewers of traceback.py, would you mind taking a look? |
|
There are conflicts, please fix them. |
|
@StanFromIreland conflicts resolved |
Lib/traceback.py
Outdated
| 2 if unicodedata.east_asian_width(char) in _WIDE_CHAR_SPECIFIERS else 1 | ||
| for char in line[:offset] | ||
| ) | ||
| from _pyrepl.utils import wlen |
There was a problem hiding this comment.
I would prefer to not depend on _pyrepl in the traceback module. I would prefer to move wlen() here, and modify _pyrepl.utils to get it from traceback.
There was a problem hiding this comment.
I've moved wlen/str_width in commit 467656e and made them private (prefixed with _) to avoid putting them in traceback.__all__ but mypy isn't happy about that. Should I make them public?
There was a problem hiding this comment.
Are `# type: ignore` comments okay in this case?
There was a problem hiding this comment.
or alternatively I could move wlen to a new support file with a name prefixed with _
|
There are conflicts again I'm afraid, and mypy isn't happy either. |
| return 2 | ||
|
|
||
|
|
||
| ANSI_ESCAPE_SEQUENCE = re.compile(r"\x1b\[[ -@]*[A-~]") |
There was a problem hiding this comment.
It should also be private.
| import unicodedata | ||
| if ord(c) < 128: | ||
| return 1 |
There was a problem hiding this comment.
There is no need to import unicodedata for ASCII characters:
| import unicodedata | |
| if ord(c) < 128: | |
| return 1 | |
| if ord(c) < 128: | |
| return 1 | |
| import unicodedata |
| def _zip_display_width(line, carets): | ||
| import unicodedata | ||
| carets = iter(carets) | ||
| for char in unicodedata.iter_graphemes(line): | ||
| char = str(char) | ||
| char_width = _display_width(char) | ||
| yield char, "".join(itertools.islice(carets, char_width)) |
There was a problem hiding this comment.
Would it be possible to avoid the heavy unicodedata import for an ASCII line?
| def _zip_display_width(line, carets): | |
| import unicodedata | |
| carets = iter(carets) | |
| for char in unicodedata.iter_graphemes(line): | |
| char = str(char) | |
| char_width = _display_width(char) | |
| yield char, "".join(itertools.islice(carets, char_width)) | |
| def _zip_display_width(line, carets): | |
| carets = iter(carets) | |
| if line.isascii(): | |
| for char in line: | |
| yield char, next(carets, "") | |
| else: | |
| import unicodedata | |
| for char in unicodedata.iter_graphemes(line): | |
| char = str(char) | |
| char_width = _display_width(char) | |
| yield char, "".join(itertools.islice(carets, char_width)) |
I'm not sure that my code is correct :-)
| @@ -0,0 +1 @@ | |||
| Fix traceback color output with unicode characters | |||
There was a problem hiding this comment.
| Fix traceback color output with unicode characters | |
| Fix traceback color output with Unicode characters. |
Claude Fleet — Thorough Review{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01ND5n5aLjRPCKzQVQMgfjCM","type":"tool_result","content":"diff --git a/Lib/_pyrepl/utils.py b/Lib/_pyrepl/utils.py\nindex 25d7ac1bd0b..d79c451949f 100644\n--- a/Lib/_pyrepl/utils.py\n+++ b/Lib/_pyrepl/utils.py\n@@ -1,6 +1,5 @@\n from future import annotations\n import builtins\n-import functools\n import keyword\n import re\n import token as T\n@@ -11,12 +10,12 @@\n from collections import deque\n from io import StringIO\n from tokenize import TokenInfo as TI\n+from traceback import _str_width as str_width, _wlen as wlen\n from typing import Iterable, Iterator, Match, NamedTuple, Self\n \n from .types import CharBuffer, CharWidths\n from .trace import trace\n \n-ANSI_ESCAPE_SEQUENCE = re.compile(r"\x1b\[[ -@][A-~]")\n ZERO_WIDTH_BRACKET = re.compile(r"\x01.?\x02")\n ZERO_WIDTH_TRANS = str.maketrans({"\x01": "", "\x02": ""})\n IDENTIFIERS_AFTER = {"def", "class"}\n@@ -59,32 +58,6 @@ class ColorSpan(NamedTuple):\n tag: str\n \n \n-@functools.cache\n-def str_width(c: str) -> int:\n- if ord(c) < 128:\n- return 1\n- # gh-139246 for zero-width joiner and combining characters\n- if unicodedata.combining(c):\n- return 0\n- category = unicodedata.category(c)\n- if category == "Cf" and c != "\u00ad":\n- return 0\n- w = unicodedata.east_asian_width(c)\n- if w in ("N", "Na", "H", "A"):\n- return 1\n- return 2\n-\n-\n-def wlen(s: str) -> int:\n- if len(s) == 1 and s != "\x1a":\n- return str_width(s)\n- length = sum(str_width(i) for i in s)\n- # remove lengths of any escape sequences\n- sequence = ANSI_ESCAPE_SEQUENCE.findall(s)\n- ctrl_z_cnt = s.count("\x1a")\n- return length - sum(len(i) for i in sequence) + ctrl_z_cnt\n-\n-\n def unbracket(s: str, including_content: bool = False) -> str:\n r"""Return (truncated) |
Claude Fleet — Thorough Review{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01BVnZXRJ7NosSSQTzvjJ2qX","type":"tool_result","content":"diff --git a/Lib/_pyrepl/utils.py b/Lib/_pyrepl/utils.py\nindex 25d7ac1bd0b..d79c451949f 100644\n--- a/Lib/_pyrepl/utils.py\n+++ b/Lib/_pyrepl/utils.py\n@@ -1,6 +1,5 @@\n from future import annotations\n import builtins\n-import functools\n import keyword\n import re\n import token as T\n@@ -11,12 +10,12 @@\n from collections import deque\n from io import StringIO\n from tokenize import TokenInfo as TI\n+from traceback import _str_width as str_width, _wlen as wlen\n from typing import Iterable, Iterator, Match, NamedTuple, Self\n \n from .types import CharBuffer, CharWidths\n from .trace import trace\n \n-ANSI_ESCAPE_SEQUENCE = re.compile(r"\x1b\[[ -@][A-~]")\n ZERO_WIDTH_BRACKET = re.compile(r"\x01.?\x02")\n ZERO_WIDTH_TRANS = str.maketrans({"\x01": "", "\x02": ""})\n IDENTIFIERS_AFTER = {"def", "class"}\n@@ -59,32 +58,6 @@ class ColorSpan(NamedTuple):\n tag: str\n \n \n-@functools.cache\n-def str_width(c: str) -> int:\n- if ord(c) < 128:\n- return 1\n- # gh-139246 for zero-width joiner and combining characters\n- if unicodedata.combining(c):\n- return 0\n- category = unicodedata.category(c)\n- if category == "Cf" and c != "\u00ad":\n- return 0\n- w = unicodedata.east_asian_width(c)\n- if w in ("N", "Na", "H", "A"):\n- return 1\n- return 2\n-\n-\n-def wlen(s: str) -> int:\n- if len(s) == 1 and s != "\x1a":\n- return str_width(s)\n- length = sum(str_width(i) for i in s)\n- # remove lengths of any escape sequences\n- sequence = ANSI_ESCAPE_SEQUENCE.findall(s)\n- ctrl_z_cnt = s.count("\x1a")\n- return length - sum(len(i) for i in sequence) + ctrl_z_cnt\n-\n-\n def unbracket(s: str, including_content: bool = False) -> str:\n r"""Return (truncated) |
|
@pablogsal You may have done something wrong :) |
Account for the display width of Unicode characters so that colors and underlining in traceback output are correct.
Closes #130273