From 0c9fbd9caf79ca7f397cf6309e482826aeafd843 Mon Sep 17 00:00:00 2001 From: Aralox Date: Sat, 17 Oct 2020 20:52:57 +1100 Subject: [PATCH] Issue-165 Added support for f-strings (literal string interpolation https://www.python.org/dev/peps/pep-0498/) Opcodes handled: FORMAT_VALUE, BUILD_STRING. Added AST node classes for FormattedValue and JoinedStr. --- .gitignore | 1 + ASTNode.cpp | 8 +++ ASTNode.h | 44 +++++++++++++++ ASTree.cpp | 92 ++++++++++++++++++++++++++++++++ bytecode.cpp | 10 ++-- bytecode.h | 2 +- pyc_string.cpp | 46 ++++++++++------ pyc_string.h | 4 +- tests/compiled/f-string.3.7.pyc | Bin 0 -> 1608 bytes tests/input/f-string.py | 42 +++++++++++++++ tests/tokenized/f-string.txt | 32 +++++++++++ 11 files changed, 257 insertions(+), 24 deletions(-) create mode 100644 tests/compiled/f-string.3.7.pyc create mode 100644 tests/input/f-string.py create mode 100644 tests/tokenized/f-string.txt diff --git a/.gitignore b/.gitignore index 2d388be..6237dda 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ *.gcda *.kdev4 /.kdev4 +__pycache__ diff --git a/ASTNode.cpp b/ASTNode.cpp index d375b65..7e3879e 100644 --- a/ASTNode.cpp +++ b/ASTNode.cpp @@ -79,3 +79,11 @@ const char* ASTBlock::type_str() const }; return s_type_strings[blktype()]; } +/* ASTFormattedValue */ +// This must be a triple quote (''' or """), to handle interpolated string literals containing the opposite quote style. +// E.g. f'''{"interpolated "123' literal"}''' -> valid. +// E.g. f"""{"interpolated "123' literal"}""" -> valid. +// E.g. f'{"interpolated "123' literal"}' -> invalid, unescaped quotes in literal. +// E.g. f'{"interpolated \"123\' literal"}' -> invalid, f-string expression does not allow backslash. +// NOTE: Nested f-strings not supported. +const char* ASTFormattedValue::F_STRING_QUOTE = "'''"; diff --git a/ASTNode.h b/ASTNode.h index 728c7f4..71dad64 100644 --- a/ASTNode.h +++ b/ASTNode.h @@ -15,6 +15,7 @@ public: NODE_TUPLE, NODE_LIST, NODE_MAP, NODE_SUBSCR, NODE_PRINT, NODE_CONVERT, NODE_KEYWORD, NODE_RAISE, NODE_EXEC, NODE_BLOCK, NODE_COMPREHENSION, NODE_LOADBUILDCLASS, NODE_AWAITABLE, + NODE_FORMATTEDVALUE, NODE_JOINEDSTR, // Empty node types NODE_LOCALS, @@ -611,4 +612,47 @@ private: PycRef m_expr; }; +class ASTFormattedValue : public ASTNode { +public: + static const char* F_STRING_QUOTE; + enum ConversionFlag { + NONE=0, + STR=1, + REPR=2, + ASCII=3, + FMTSPEC=4 + }; + + ASTFormattedValue(PycRef val, ConversionFlag conversion, PycRef format_spec) + : ASTNode(NODE_FORMATTEDVALUE), + m_val(std::move(val)), + m_conversion(conversion), + m_format_spec(std::move(format_spec)) + {} + + PycRef val() const { return m_val; } + ConversionFlag conversion() const { return m_conversion; } + PycRef format_spec() const { return m_format_spec; } + +private: + PycRef m_val; + ConversionFlag m_conversion; + PycRef m_format_spec; +}; + +// Same as ASTList +class ASTJoinedStr : public ASTNode { +public: + typedef std::list> value_t; + + ASTJoinedStr(value_t values) + : ASTNode(NODE_JOINEDSTR), m_values(std::move(values)) { } + + const value_t& values() const { return m_values; } + +private: + value_t m_values; +}; + + #endif diff --git a/ASTree.cpp b/ASTree.cpp index 277c1b9..ed252a9 100644 --- a/ASTree.cpp +++ b/ASTree.cpp @@ -366,6 +366,17 @@ PycRef BuildFromCode(PycRef code, PycModule* mod) } } break; + case Pyc::BUILD_STRING_A: + { + // Nearly identical logic to BUILD_LIST + ASTList::value_t values; + for (int i = 0; i < operand; i++) { + values.push_front(stack.top()); + stack.pop(); + } + stack.push(new ASTJoinedStr(values)); + } + break; case Pyc::BUILD_TUPLE_A: { ASTTuple::value_t values; @@ -782,6 +793,35 @@ PycRef BuildFromCode(PycRef code, PycModule* mod) stack.push(NULL); // We can totally hack this >_> } break; + case Pyc::FORMAT_VALUE_A: + { + auto conversion_flag = static_cast(operand); + switch (conversion_flag) + { + case ASTFormattedValue::ConversionFlag::NONE: + case ASTFormattedValue::ConversionFlag::STR: + case ASTFormattedValue::ConversionFlag::REPR: + case ASTFormattedValue::ConversionFlag::ASCII: + { + auto val = stack.top(); + stack.pop(); + stack.push(new ASTFormattedValue(val, conversion_flag, nullptr)); + } + break; + case ASTFormattedValue::ConversionFlag::FMTSPEC: + { + auto format_spec = stack.top(); + stack.pop(); + auto val = stack.top(); + stack.pop(); + stack.push(new ASTFormattedValue(val, conversion_flag, format_spec)); + } + break; + default: + fprintf(stderr, "Unsupported FORMAT_VALUE_A conversion flag: %d\n", operand); + } + } + break; case Pyc::GET_AWAITABLE: { PycRef object = stack.top(); @@ -2277,6 +2317,33 @@ static void print_block(PycRef blk, PycModule* mod) { } } +void print_formatted_value(PycRef formatted_value, PycModule* mod) +{ + fputs("{", pyc_output); + print_src(formatted_value->val(), mod); + + switch (formatted_value->conversion()) + { + case ASTFormattedValue::ConversionFlag::NONE: + break; + case ASTFormattedValue::ConversionFlag::STR: + fputs("!s", pyc_output); + break; + case ASTFormattedValue::ConversionFlag::REPR: + fputs("!r", pyc_output); + break; + case ASTFormattedValue::ConversionFlag::ASCII: + fputs("!a", pyc_output); + break; + case ASTFormattedValue::ConversionFlag::FMTSPEC: + fprintf(pyc_output, ":%s", formatted_value->format_spec().cast()->object().cast()->value()); + break; + default: + fprintf(stderr, "Unsupported NODE_FORMATTEDVALUE conversion flag: %d\n", formatted_value->conversion()); + } + fputs("}", pyc_output); +} + void print_src(PycRef node, PycModule* mod) { if (node == NULL) { @@ -2367,6 +2434,31 @@ void print_src(PycRef node, PycModule* mod) } } break; + case ASTNode::NODE_FORMATTEDVALUE: + fprintf(pyc_output, "f%s", ASTFormattedValue::F_STRING_QUOTE); + print_formatted_value(node.cast(), mod); + fputs(ASTFormattedValue::F_STRING_QUOTE, pyc_output); + break; + case ASTNode::NODE_JOINEDSTR: + fprintf(pyc_output, "f%s", ASTFormattedValue::F_STRING_QUOTE); + for (const auto& val : node.cast()->values()) + { + switch (val.type()) + { + case ASTNode::NODE_FORMATTEDVALUE: + print_formatted_value(val.cast(), mod); + break; + case ASTNode::NODE_OBJECT: + // When printing a piece of the f-string, keep the quote style consistent. + // This avoids problems when ''' or """ is part of the string. + print_const(val.cast()->object(), mod, ASTFormattedValue::F_STRING_QUOTE); + break; + default: + fprintf(stderr, "Unsupported node type %d in NODE_JOINEDSTR\n", val.type()); + } + } + fputs(ASTFormattedValue::F_STRING_QUOTE, pyc_output); + break; case ASTNode::NODE_KEYWORD: fprintf(pyc_output, "%s", node.cast()->word_str()); break; diff --git a/bytecode.cpp b/bytecode.cpp index 0274808..827a0b0 100644 --- a/bytecode.cpp +++ b/bytecode.cpp @@ -147,7 +147,7 @@ bool Pyc::IsCompareArg(int opcode) return (opcode == Pyc::COMPARE_OP_A); } -void print_const(PycRef obj, PycModule* mod) +void print_const(PycRef obj, PycModule* mod, const char* parent_f_string_quote) { if (obj == NULL) { fputs("", pyc_output); @@ -156,10 +156,10 @@ void print_const(PycRef obj, PycModule* mod) switch (obj->type()) { case PycObject::TYPE_STRING: - OutputString(obj.cast(), mod->strIsUnicode() ? 'b' : 0); + OutputString(obj.cast(), mod->strIsUnicode() ? 'b' : 0, false, pyc_output, parent_f_string_quote); break; case PycObject::TYPE_UNICODE: - OutputString(obj.cast(), mod->strIsUnicode() ? 0 : 'u'); + OutputString(obj.cast(), mod->strIsUnicode() ? 0 : 'u', false, pyc_output, parent_f_string_quote); break; case PycObject::TYPE_STRINGREF: case PycObject::TYPE_INTERNED: @@ -168,9 +168,9 @@ void print_const(PycRef obj, PycModule* mod) case PycObject::TYPE_SHORT_ASCII: case PycObject::TYPE_SHORT_ASCII_INTERNED: if (mod->majorVer() >= 3) - OutputString(obj.cast(), 0); + OutputString(obj.cast(), 0, false, pyc_output, parent_f_string_quote); else - OutputString(obj.cast(), mod->strIsUnicode() ? 'b' : 0); + OutputString(obj.cast(), mod->strIsUnicode() ? 'b' : 0, false, pyc_output, parent_f_string_quote); break; case PycObject::TYPE_TUPLE: case PycObject::TYPE_SMALL_TUPLE: diff --git a/bytecode.h b/bytecode.h index 06ca0c1..a577100 100644 --- a/bytecode.h +++ b/bytecode.h @@ -29,6 +29,6 @@ bool IsCompareArg(int opcode); } -void print_const(PycRef obj, PycModule* mod); +void print_const(PycRef obj, PycModule* mod, const char* parent_f_string_quote = nullptr); void bc_next(PycBuffer& source, PycModule* mod, int& opcode, int& operand, int& pos); void bc_disasm(PycRef code, PycModule* mod, int indent); diff --git a/pyc_string.cpp b/pyc_string.cpp index 325934f..9c1b868 100644 --- a/pyc_string.cpp +++ b/pyc_string.cpp @@ -85,7 +85,7 @@ bool PycString::isEqual(PycRef obj) const return isEqual(strObj->m_value); } -void OutputString(PycRef str, char prefix, bool triple, FILE* F) +void OutputString(PycRef str, char prefix, bool triple, FILE* F, const char* parent_f_string_quote) { if (prefix != 0) fputc(prefix, F); @@ -99,23 +99,31 @@ void OutputString(PycRef str, char prefix, bool triple, FILE* F) // Determine preferred quote style (Emulate Python's method) bool useQuotes = false; - while (len--) { - if (*ch == '\'') { - useQuotes = true; - } else if (*ch == '"') { - useQuotes = false; - break; + if (!parent_f_string_quote) { + while (len--) { + if (*ch == '\'') { + useQuotes = true; + } + else if (*ch == '"') { + useQuotes = false; + break; + } + ch++; } - ch++; + } + else { + useQuotes = parent_f_string_quote[0] == '"'; } ch = str->value(); len = str->length(); // Output the string - if (triple) - fputs(useQuotes ? "\"\"\"" : "'''", F); - else - fputc(useQuotes ? '"' : '\'', F); + if (!parent_f_string_quote) { + if (triple) + fputs(useQuotes ? "\"\"\"" : "'''", F); + else + fputc(useQuotes ? '"' : '\'', F); + } while (len--) { if (*ch < 0x20 || *ch == 0x7F) { if (*ch == '\r') { @@ -144,13 +152,19 @@ void OutputString(PycRef str, char prefix, bool triple, FILE* F) fputs("\\\"", F); else if (*ch == '\\') fputs("\\\\", F); + else if (parent_f_string_quote && *ch == '{') + fputs("{{", F); + else if (parent_f_string_quote && *ch == '}') + fputs("}}", F); else fputc(*ch, F); } ch++; } - if (triple) - fputs(useQuotes ? "\"\"\"" : "'''", F); - else - fputc(useQuotes ? '"' : '\'', F); + if (!parent_f_string_quote) { + if (triple) + fputs(useQuotes ? "\"\"\"" : "'''", F); + else + fputc(useQuotes ? '"' : '\'', F); + } } diff --git a/pyc_string.h b/pyc_string.h index 2cb3b5e..e23b70b 100644 --- a/pyc_string.h +++ b/pyc_string.h @@ -31,7 +31,7 @@ private: std::string m_value; }; -void OutputString(PycRef str, char prefix = 0, - bool triple = false, FILE* F = pyc_output); +void OutputString(PycRef str, char prefix = 0, bool triple = false, + FILE* F = pyc_output, const char* parent_f_string_quote = nullptr); #endif diff --git a/tests/compiled/f-string.3.7.pyc b/tests/compiled/f-string.3.7.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7e698207c5675e7888c0204e4eeb2d1919b4e5a GIT binary patch literal 1608 zcmb7E&2k$>5T4mx{aTh2S+<@$P^(f8|bKfUZ! z82d{p#kWPm7c`?&jWJ|4LvC}_3}&mT+Xm`($z^sK4N=C@AzL!eu)N4oumxLcnFTAU zR8hNy&8~>5(qD7VGD&H|Sfm_FBeN^Lkg&SQtys&LuAHqewOnJkrlO$HF1Q;i7hD&7 z!LdoG;Rd0O*Mx!B7Y4Qz8*VPwbA~o+xZl-#rxx@2kux(npk6hb$UCAdBK_>Gb56BBBJPJm!(O&34}5$nr@EEuI)0q4C1m;9&(~eLiOoEt0q&_A&$WH^8g8nX z2cm{g@Dns9`fIA8=qWzMPk&@cz07)x!?dE$(mD@S$CqWhj-S6^-J^t`CH&Ro>EP$T z|NP_4v(*MY$d#^u9wl4N2@);%PU`#Cfvk-mM{(%-6OhL6o9NN1LEG|nVYQ%E8wcUk ziQ&}oW&-*{ko6wy!h^d}Hhq=5*vZLfu5B6!%pICdc(SuRoN3r9_WGY+mwU3r; zorrKG{1|fn`v~suoPw9Y@e$-EIEW+$@wXy`G$G;}5FbD&CNs|oAy55XUFw}qN46s> z%MzH$dTnLPs6&uGc519I!CJ9KC_a#4`KiNWmH6L0yOAlsSw9r`+$a7-x^ zf!x085z$6Yshg%LH{J?Yt6gwY>LzrgZVkq=)B)vGAplj&##Jin$pxpounxwy)*mE% zFX8)AKicglJV>~Z%_x`(m>-AENJQr#yRPrk9eQpoLdP3k?#$;oZ|-_7n?>gV-BAqk zUfPWLc;=5{H}GNPc%HgAadr}lC?eDO5LVPSwrUfMl%zhjs^|6<&8SN71N|4a8NE;K zc~~O-7uMyW@(QmPaIsbdw@zLq)kDvj9;0(~hxXslWZ@=()Etn_Q4o03nHRgB>kE33 z_2MKv@2=U#sT1y|I7mYzT9m1DQI1_xxARh!msA}pVd4u+N+10} {a:x} {a:o} {a:e}') +print(f'''some {{braces}} {"inner literal: {braces} {{double braces}}"}''') +print(f'''f-string dict {some_dict[2]} and {{function call in expression}}: {max([1,20,3])}''') +print(f'{(lambda x: x*2)(3)}') +msg = ( + f'a {var1}' + f'cool' + f'multiline {var2}\n' + f'f-string {var3}' +) + +# Commented out because LOAD_/CALL_METHOD not supported. https://github.com/zrax/pycdc/issues/163 +# The f-string will decompile correctly, however. +# import datetime +# datetime.date(2015,9,29) +#print(f'{now:%Y-%m-%d %H:%M}') \ No newline at end of file diff --git a/tests/tokenized/f-string.txt b/tests/tokenized/f-string.txt new file mode 100644 index 0000000..b20b02c --- /dev/null +++ b/tests/tokenized/f-string.txt @@ -0,0 +1,32 @@ +var1 = 'x' +var2 = 'y' +x = 1.23456 +s1 = 1.23456 +var3 = 1.23456 +a = 15 +some_dict = { } +some_dict [ 2 ] = 3 +f'' +f'{123}' +f'{123}{var1}' +f'{123}ok' +f'ok{123}' +assigned = f'{123}' +print ( f'{123}' ) +print ( f'{123}{123}{var3}{123}' ) +print ( f'{var3}' ) +print ( f'{var3:4.5}' ) +print ( f'f-string {123}' ) +print ( f'{123}:\\s+' ) +print ( f'x{12}' * 3 ) +print ( f'f-string. \t\tformat value 0: {var1}, 1 (!s): {var2!s}, 2 (!r): {var2!r}, 3 (!a): {var2!a}, 4: {var3:6.3}, constant: {123}. End.' ) +print ( 'percent format %d ' % 444 + f'f-string {123} and {var1!s}' + f' add another f-str {var3:2.3}' + ' regular string regular string ' ) +print ( f'' ) +print ( f'"""{\'single quoted string\'} \'singles in f-string\' {"single quote \' inside"} "doubles in f-string" {\'double quoted string\'} " both \' {\'double quotes " inside\'}"""' ) +print ( f'' ) +print ( f'' ) +print ( f'{var3 * x} {var3:.2f} {var3:.5f} {x:02} {x * x:3} {x * x * x:4} {s1:>10} {a:x} {a:o} {a:e}' ) +print ( f'some {{braces}} {\'inner literal: {braces} {{double braces}}\'}' ) +print ( f'f-string dict {some_dict[2]} and {{function call in expression}}: {max([\n 1,\n 20,\n 3])}' ) +print ( f'{(lambda x: x * 2)(3)}' ) +msg = f'a {var1}coolmultiline {var2}\nf-string {var3}'