From 8014ac2b14ef6220b6208d37f2604af13bdfed04 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Sun, 6 Oct 2019 14:34:24 -0700 Subject: [PATCH] Print unicode as default in Python 2.x when CO_FUTURE_UNICODE_LITERALS is set. Fixes #141. --- ASTree.cpp | 30 ++++++++++++++++---------- bytecode.cpp | 9 +++++--- pyc_module.h | 5 +++++ pycdas.cpp | 9 +++++--- scripts/token_dump | 2 +- tests/compiled/unicode_future.2.6.pyc | Bin 0 -> 265 bytes tests/compiled/unicode_future.2.7.pyc | Bin 0 -> 265 bytes 7 files changed, 37 insertions(+), 18 deletions(-) create mode 100644 tests/compiled/unicode_future.2.6.pyc create mode 100644 tests/compiled/unicode_future.2.7.pyc diff --git a/ASTree.cpp b/ASTree.cpp index 7daf6bf..8da0f86 100644 --- a/ASTree.cpp +++ b/ASTree.cpp @@ -2777,17 +2777,25 @@ bool print_docstring(PycRef obj, int indent, PycModule* mod) { // docstrings are translated from the bytecode __doc__ = 'string' to simply '''string''' signed char prefix = -1; - if (obj.type() == PycObject::TYPE_STRING) - prefix = mod->majorVer() == 3 ? 'b' : 0; - else if (obj.type() == PycObject::TYPE_UNICODE) - prefix = mod->majorVer() == 3 ? 0 : 'u'; - else if (obj.type() == PycObject::TYPE_INTERNED || - obj.type() == PycObject::TYPE_STRINGREF || - obj.type() == PycObject::TYPE_ASCII || - obj.type() == PycObject::TYPE_ASCII_INTERNED || - obj.type() == PycObject::TYPE_SHORT_ASCII || - obj.type() == PycObject::TYPE_SHORT_ASCII_INTERNED) - prefix = 0; + switch (obj.type()) { + case PycObject::TYPE_STRING: + prefix = mod->strIsUnicode() ? 'b' : 0; + break; + case PycObject::TYPE_UNICODE: + prefix = mod->strIsUnicode() ? 0 : 'u'; + break; + case PycObject::TYPE_STRINGREF: + case PycObject::TYPE_INTERNED: + case PycObject::TYPE_ASCII: + case PycObject::TYPE_ASCII_INTERNED: + case PycObject::TYPE_SHORT_ASCII: + case PycObject::TYPE_SHORT_ASCII_INTERNED: + if (mod->majorVer() >= 3) + prefix = 0; + else + prefix = mod->strIsUnicode() ? 'b' : 0; + break; + } if (prefix != -1) { start_line(indent); OutputString(obj.cast(), prefix, true); diff --git a/bytecode.cpp b/bytecode.cpp index 1f5bf2c..fddb085 100644 --- a/bytecode.cpp +++ b/bytecode.cpp @@ -151,10 +151,10 @@ void print_const(PycRef obj, PycModule* mod) switch (obj->type()) { case PycObject::TYPE_STRING: - OutputString(obj.cast(), (mod->majorVer() == 3) ? 'b' : 0); + OutputString(obj.cast(), mod->strIsUnicode() ? 'b' : 0); break; case PycObject::TYPE_UNICODE: - OutputString(obj.cast(), (mod->majorVer() == 3) ? 0 : 'u'); + OutputString(obj.cast(), mod->strIsUnicode() ? 0 : 'u'); break; case PycObject::TYPE_STRINGREF: case PycObject::TYPE_INTERNED: @@ -162,7 +162,10 @@ void print_const(PycRef obj, PycModule* mod) case PycObject::TYPE_ASCII_INTERNED: case PycObject::TYPE_SHORT_ASCII: case PycObject::TYPE_SHORT_ASCII_INTERNED: - OutputString(obj.cast(), 0); + if (mod->majorVer() >= 3) + OutputString(obj.cast(), 0); + else + OutputString(obj.cast(), mod->strIsUnicode() ? 'b' : 0); break; case PycObject::TYPE_TUPLE: case PycObject::TYPE_SMALL_TUPLE: diff --git a/pyc_module.h b/pyc_module.h index 77e4734..1daf556 100644 --- a/pyc_module.h +++ b/pyc_module.h @@ -53,6 +53,11 @@ public: bool isUnicode() const { return m_unicode; } + bool strIsUnicode() const + { + return (m_maj >= 3) || (m_code->flags() & PycCode::CO_FUTURE_UNICODE_LITERALS) != 0; + } + PycRef code() const { return m_code; } void intern(PycRef str) { m_interns.push_back(str); } diff --git a/pycdas.cpp b/pycdas.cpp index f8096f0..f2db659 100644 --- a/pycdas.cpp +++ b/pycdas.cpp @@ -129,12 +129,12 @@ void output_object(PycRef obj, PycModule* mod, int indent) break; case PycObject::TYPE_STRING: iputs(indent, ""); - OutputString(obj.cast(), (mod->majorVer() == 3) ? 'b' : 0); + OutputString(obj.cast(), mod->strIsUnicode() ? 'b' : 0); fputs("\n", pyc_output); break; case PycObject::TYPE_UNICODE: iputs(indent, ""); - OutputString(obj.cast(), (mod->majorVer() == 3) ? 0 : 'u'); + OutputString(obj.cast(), mod->strIsUnicode() ? 0 : 'u'); fputs("\n", pyc_output); break; case PycObject::TYPE_STRINGREF: @@ -144,7 +144,10 @@ void output_object(PycRef obj, PycModule* mod, int indent) case PycObject::TYPE_SHORT_ASCII: case PycObject::TYPE_SHORT_ASCII_INTERNED: iputs(indent, ""); - OutputString(obj.cast(), 0); + if (mod->majorVer() >= 3) + OutputString(obj.cast(), 0); + else + OutputString(obj.cast(), mod->strIsUnicode() ? 'b' : 0); fputs("\n", pyc_output); break; case PycObject::TYPE_TUPLE: diff --git a/scripts/token_dump b/scripts/token_dump index 4e9b449..1207055 100755 --- a/scripts/token_dump +++ b/scripts/token_dump @@ -110,7 +110,7 @@ RE_WHITESPACE = re.compile(r'\s+') RE_WORD = re.compile(r'[A-Za-z_][A-Za-z0-9_]*') RE_INT = re.compile(r'[0-9][0-9_]*|0[Xx][0-9A-Fa-f_]+|0[Bb][0-1_]+|0[Oo][0-7_]+') RE_FLOAT = re.compile(r'(([0-9][0-9_]*)?\.[0-9][0-9_]*|[0-9][0-9_]*\.)([eE][+-]?[0-9][0-9_]*)?') -RE_START_STRING = re.compile(r'([rR][fFbB]?|[uU]|[fF][rR]?|[bB][rR]+)?(\'\'\'|\'|"""|")') +RE_START_STRING = re.compile(r'([rR][fFbB]?|[uU]|[fF][rR]?|[bB][rR]?)?(\'\'\'|\'|"""|")') # Note, tokens sharing a common prefix should be entered in order from # longest to shortest, so we don't mismatch a long token as a sequence diff --git a/tests/compiled/unicode_future.2.6.pyc b/tests/compiled/unicode_future.2.6.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6cc3bde0ab0cb52702ed49b1c3ec7245996997a GIT binary patch literal 265 zcmYL@O$x#=5QQiGQwl=Y9zeQkFCZe~T2OIQkWiZxim@%tWYJ@|^;TZR3+TjJ9EO)~ z-h|;jU$gMOf1C^Wl}qm$9MB5_LCxR_pcrHfm~bF-K)8^(AUw!C5I!8^fWB65s+bXB zL}KcyXoTddLfPd?n+chx!9<^U)1w^Rtt@k+a2Hz}LlVwQgQk<5A7}#fNQ}nr3zrcD pL2UPb(Q_J;B(3V!U^-?PjwNl6Q(7-?jW9~CY*cNNM^3;3_5oUfIq?7h literal 0 HcmV?d00001 diff --git a/tests/compiled/unicode_future.2.7.pyc b/tests/compiled/unicode_future.2.7.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d36764d0a2fe5c249197fff2e8b04daac8a7f70 GIT binary patch literal 265 zcmYL@O$x#=5QQh{k5Ul2_5jjVdjSy<*Mf?hf&^_+D5O6$lSPl=)@ynIomh**@bb-@ zFg)+Ih#tr1wSZr7dbhyAtOz7E0}W6!$QdxnA?F}H$UTrgFy8-94Ue~R(T5ViZ