From 7bb356d00ae7f5c1eb1619533f86a9d238017ad1 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Mon, 3 Aug 2009 23:13:50 +0000 Subject: [PATCH] Improved output formatting, and added more instruction support --- ASTNode.cpp | 14 +++- ASTNode.h | 33 ++++++-- ASTree.cpp | 209 +++++++++++++++++++++++++++++++++++++++++++-------- bytecode.cpp | 14 +--- pycdas.cpp | 18 ++--- pycdc.cpp | 4 + string.cpp | 37 +++++++-- string.h | 7 +- 8 files changed, 261 insertions(+), 75 deletions(-) diff --git a/ASTNode.cpp b/ASTNode.cpp index e28f52b..c8dc006 100644 --- a/ASTNode.cpp +++ b/ASTNode.cpp @@ -17,12 +17,22 @@ void ASTNodeList::removeFirst() } +/* ASTUnary */ +const char* ASTUnary::op_str() const +{ + static const char* s_op_strings[] = { + "+", "-", "~", "not" + }; + return s_op_strings[op()]; +} + + /* ASTBinary */ const char* ASTBinary::op_str() const { static const char* s_op_strings[] = { - "**", "*", "/", "%", "+", "-", "<<", ">>", "&", "^", "|", "//", - "" + ".", " ** ", " * ", " / ", " // ", " % ", " + ", " - ", + " << ", " >> ", " & ", " ^ ", " | ", " and ", " or " }; return s_op_strings[op()]; } diff --git a/ASTNode.h b/ASTNode.h index 5346240..93ec330 100644 --- a/ASTNode.h +++ b/ASTNode.h @@ -12,7 +12,7 @@ public: NODE_INVALID, NODE_NODELIST, NODE_OBJECT, NODE_UNARY, NODE_BINARY, NODE_COMPARE, NODE_STORE, NODE_RETURN, NODE_NAME, NODE_DELETE, NODE_FUNCTION, NODE_CLASS, NODE_CALL, NODE_IMPORT, NODE_TUPLE, - NODE_LIST, NODE_MAP, NODE_SUBSCR, + NODE_LIST, NODE_MAP, NODE_SUBSCR, NODE_PRINT, // Empty nodes NODE_PASS, NODE_LOCALS @@ -67,10 +67,19 @@ private: class ASTUnary : public ASTNode { public: - ASTUnary(PycRef operand) - : ASTNode(NODE_UNARY), m_operand(operand) { } + enum UnOp { + UN_POSITIVE, UN_NEGATIVE, UN_INVERT, UN_NOT + }; + + ASTUnary(PycRef operand, int op) + : ASTNode(NODE_UNARY), m_op(op), m_operand(operand) { } PycRef operand() const { return m_operand; } + int op() const { return m_op; } + virtual const char* op_str() const; + +protected: + int m_op; 
private: PycRef m_operand; @@ -80,9 +89,9 @@ private: class ASTBinary : public ASTNode { public: enum BinOp { - BIN_POWER, BIN_MULTIPLY, BIN_DIVIDE, BIN_MODULO, BIN_ADD, - BIN_SUBTRACT, BIN_LSHIFT, BIN_RSHIFT, BIN_AND, BIN_XOR, - BIN_OR, BIN_FLOOR, BIN_ATTR + BIN_ATTR, BIN_POWER, BIN_MULTIPLY, BIN_DIVIDE, BIN_FLOOR, BIN_MODULO, + BIN_ADD, BIN_SUBTRACT, BIN_LSHIFT, BIN_RSHIFT, BIN_AND, BIN_XOR, + BIN_OR, BIN_LOG_AND, BIN_LOG_OR }; ASTBinary(PycRef left, PycRef right, int op, @@ -290,4 +299,16 @@ private: PycRef m_key; }; + +class ASTPrint : public ASTNode { +public: + ASTPrint(PycRef value) + : ASTNode(NODE_PRINT), m_value(value) { } + + PycRef value() const { return m_value; } + +private: + PycRef m_value; +}; + #endif diff --git a/ASTree.cpp b/ASTree.cpp index a219307..8f9f47b 100644 --- a/ASTree.cpp +++ b/ASTree.cpp @@ -12,6 +12,10 @@ * avoid cleaning the output tree) */ static bool cleanBuild; +/* Keep track of whether we're in a print statement, so we can make + * chained prints (print x, y, z) prettier */ +static bool inPrint; + PycRef BuildFromCode(PycRef code, PycModule* mod) { PycBuffer source(code->code()->value(), code->code()->length()); @@ -355,6 +359,15 @@ PycRef BuildFromCode(PycRef code, PycModule* mod) lines.push_back(value); } break; + case (PY_1000 | Py1k::PRINT_ITEM): + case (PY_2000 | Py2k::PRINT_ITEM): + lines.push_back(new ASTPrint(stack.top())); + stack.pop(); + break; + case (PY_1000 | Py1k::PRINT_NEWLINE): + case (PY_2000 | Py2k::PRINT_NEWLINE): + lines.push_back(new ASTPrint(Node_NULL)); + break; case (PY_1000 | Py1k::RETURN_VALUE): case (PY_2000 | Py2k::RETURN_VALUE): case (PY_3000 | Py3k::RETURN_VALUE): @@ -379,6 +392,10 @@ PycRef BuildFromCode(PycRef code, PycModule* mod) stack.push(two); } break; + case (PY_1000 | Py1k::SET_LINENO): + case (PY_2000 | Py2k::SET_LINENO): + // Ignore + break; case (PY_1000 | Py1k::STORE_ATTR): case (PY_2000 | Py2k::STORE_ATTR): case (PY_3000 | Py3k::STORE_ATTR): @@ -449,6 +466,42 @@ PycRef 
BuildFromCode(PycRef code, PycModule* mod) } } break; + case (PY_1000 | Py1k::UNARY_INVERT): + case (PY_2000 | Py2k::UNARY_INVERT): + case (PY_3000 | Py3k::UNARY_INVERT): + { + PycRef arg = stack.top(); + stack.pop(); + stack.push(new ASTUnary(arg, ASTUnary::UN_INVERT)); + } + break; + case (PY_1000 | Py1k::UNARY_NEGATIVE): + case (PY_2000 | Py2k::UNARY_NEGATIVE): + case (PY_3000 | Py3k::UNARY_NEGATIVE): + { + PycRef arg = stack.top(); + stack.pop(); + stack.push(new ASTUnary(arg, ASTUnary::UN_NEGATIVE)); + } + break; + case (PY_1000 | Py1k::UNARY_NOT): + case (PY_2000 | Py2k::UNARY_NOT): + case (PY_3000 | Py3k::UNARY_NOT): + { + PycRef arg = stack.top(); + stack.pop(); + stack.push(new ASTUnary(arg, ASTUnary::UN_NOT)); + } + break; + case (PY_1000 | Py1k::UNARY_POSITIVE): + case (PY_2000 | Py2k::UNARY_POSITIVE): + case (PY_3000 | Py3k::UNARY_POSITIVE): + { + PycRef arg = stack.top(); + stack.pop(); + stack.push(new ASTUnary(arg, ASTUnary::UN_POSITIVE)); + } + break; default: if (mod->majorVer() == 1) fprintf(stderr, "Unsupported opcode: %s\n", Py1k::OpcodeNames[opcode & 0xFF]); @@ -465,12 +518,95 @@ PycRef BuildFromCode(PycRef code, PycModule* mod) return new ASTNodeList(lines); } -static void start_indent(int indent) +static int cmp_prec(PycRef parent, PycRef child) { + /* Determine whether the parent has higher precedence than the + child, so we don't flood the source code with extraneous parens. + Else we'd have expressions like (((a + b) + c) + d) when the + equivalent, a + b + c + d would suffice. 
*/ + + if (parent->type() == ASTNode::NODE_UNARY && parent.cast()->op() == ASTUnary::UN_NOT) + return 1; // Always parenthesize not(x) + if (child->type() == ASTNode::NODE_BINARY) { + PycRef binChild = child.cast(); + if (parent->type() == ASTNode::NODE_BINARY) + return binChild->op() - parent.cast()->op(); + else if (parent->type() == ASTNode::NODE_COMPARE) + return (binChild->op() == ASTBinary::BIN_LOG_AND || + binChild->op() == ASTBinary::BIN_LOG_OR) ? 1 : -1; + else if (parent->type() == ASTNode::NODE_UNARY) + return (binChild->op() == ASTBinary::BIN_POWER) ? -1 : 1; + } else if (child->type() == ASTNode::NODE_UNARY) { + PycRef unChild = child.cast(); + if (parent->type() == ASTNode::NODE_BINARY) { + PycRef binParent = parent.cast(); + if (binParent->op() == ASTBinary::BIN_LOG_AND || + binParent->op() == ASTBinary::BIN_LOG_OR) + return -1; + else if (unChild->op() == ASTUnary::UN_NOT) + return 1; + else if (binParent->op() == ASTBinary::BIN_POWER) + return 1; + else + return -1; + } else if (parent->type() == ASTNode::NODE_COMPARE) { + return (unChild->op() == ASTUnary::UN_NOT) ? 1 : -1; + } else if (parent->type() == ASTNode::NODE_UNARY) { + return unChild->op() - parent.cast()->op(); + } + } else if (child->type() == ASTNode::NODE_COMPARE) { + PycRef cmpChild = child.cast(); + if (parent->type() == ASTNode::NODE_BINARY) + return (parent.cast()->op() == ASTBinary::BIN_LOG_AND || + parent.cast()->op() == ASTBinary::BIN_LOG_OR) ? -1 : 1; + else if (parent->type() == ASTNode::NODE_COMPARE) + return cmpChild->op() - parent.cast()->op(); + else if (parent->type() == ASTNode::NODE_UNARY) + return (parent.cast()->op() == ASTUnary::UN_NOT) ? 
-1 : 1; + } + + /* For normal nodes, don't parenthesize anything */ + return -1; +} + +static void print_ordered(PycRef parent, PycRef child, + PycModule* mod, int indent) +{ + if (child->type() == ASTNode::NODE_BINARY || + child->type() == ASTNode::NODE_COMPARE) { + if (cmp_prec(parent, child) > 0) { + printf("("); + print_src(child, mod, indent); + printf(")"); + } else { + print_src(child, mod, indent); + } + } else if (child->type() == ASTNode::NODE_UNARY) { + if (cmp_prec(parent, child) > 0) { + printf("("); + print_src(child, mod, indent); + printf(")"); + } else { + print_src(child, mod, indent); + } + } else { + print_src(child, mod, indent); + } +} + +static void start_line(int indent) +{ + if (inPrint) return; for (int i=0; i node, PycModule* mod, int indent) { switch (node->type()) { @@ -478,17 +614,9 @@ void print_src(PycRef node, PycModule* mod, int indent) case ASTNode::NODE_COMPARE: { PycRef bin = node.cast(); - if (bin->op() == ASTBinary::BIN_ATTR) { - print_src(bin->left(), mod, indent); - printf("."); - print_src(bin->right(), mod, indent); - } else { - printf("("); - print_src(bin->left(), mod, indent); - printf(" %s ", bin->op_str()); - print_src(bin->right(), mod, indent); - printf(")"); - } + print_ordered(node, bin->left(), mod, indent); + printf("%s", bin->op_str()); + print_ordered(node, bin->right(), mod, indent); } break; case ASTNode::NODE_CALL: @@ -519,7 +647,7 @@ void print_src(PycRef node, PycModule* mod, int indent) for (ASTList::value_t::const_iterator b = values.begin(); b != values.end(); ++b) { if (first) printf("\n"); else printf(",\n"); - start_indent(indent + 1); + start_line(indent + 1); print_src(*b, mod, indent + 1); first = false; } @@ -534,13 +662,13 @@ void print_src(PycRef node, PycModule* mod, int indent) for (ASTMap::map_t::const_iterator b = values.begin(); b != values.end(); ++b) { if (first) printf("\n"); else printf(",\n"); - start_indent(indent + 1); + start_line(indent + 1); print_src(b->first, mod, indent + 1); 
printf(": "); print_src(b->second, mod, indent + 1); first = false; } - printf("}"); + printf(" }"); } break; case ASTNode::NODE_NAME: @@ -550,9 +678,9 @@ void print_src(PycRef node, PycModule* mod, int indent) { ASTNodeList::list_t lines = node.cast()->nodes(); for (ASTNodeList::list_t::const_iterator ln = lines.begin(); ln != lines.end(); ++ln) { - start_indent(indent); + start_line(indent); print_src(*ln, mod, indent); - printf("\n"); + end_line(); } } break; @@ -568,6 +696,18 @@ void print_src(PycRef node, PycModule* mod, int indent) case ASTNode::NODE_PASS: printf("pass"); break; + case ASTNode::NODE_PRINT: + if (node.cast()->value() == Node_NULL) { + inPrint = false; + } else if (!inPrint) { + printf("print "); + print_src(node.cast()->value(), mod, indent); + inPrint = true; + } else { + printf(", "); + print_src(node.cast()->value(), mod, indent); + } + break; case ASTNode::NODE_RETURN: printf("return "); print_src(node.cast()->value(), mod, indent); @@ -578,7 +718,7 @@ void print_src(PycRef node, PycModule* mod, int indent) PycRef dest = node.cast()->dest(); if (src->type() == ASTNode::NODE_FUNCTION) { printf("\n"); - start_indent(indent); + start_line(indent); printf("def "); print_src(dest, mod, indent); printf("("); @@ -598,18 +738,23 @@ void print_src(PycRef node, PycModule* mod, int indent) print_src(code, mod, indent + 1); } else if (src->type() == ASTNode::NODE_CLASS) { printf("\n"); - start_indent(indent); + start_line(indent); printf("class "); print_src(dest, mod, indent); - printf("("); PycRef bases = src.cast()->bases().cast(); - bool first = true; - for (ASTTuple::value_t::const_iterator b = bases->values().begin(); b != bases->values().end(); ++b) { - if (!first) printf(", "); - print_src(*b, mod, indent); - first = false; + if (bases->values().size() > 0) { + printf("("); + bool first = true; + for (ASTTuple::value_t::const_iterator b = bases->values().begin(); b != bases->values().end(); ++b) { + if (!first) printf(", "); + print_src(*b, 
mod, indent); + first = false; + } + printf("):\n"); + } else { + // Don't put parens if there are no base classes + printf(":\n"); } - printf("):\n"); PycRef code = src.cast()->code().cast() ->func().cast()->code(); print_src(code, mod, indent + 1); @@ -679,12 +824,12 @@ void decompyle(PycRef code, PycModule* mod, int indent) { PycRef source = BuildFromCode(code, mod); + PycRef clean = source.cast(); if (cleanBuild) { // The Python compiler adds some stuff that we don't really care // about, and would add extra code for re-compilation anyway. // We strip these lines out here, and then add a "pass" statement // if the cleaned up code is empty - PycRef clean = source.cast(); if (clean->nodes().front()->type() == ASTNode::NODE_STORE) { PycRef store = clean->nodes().front().cast(); if (store->src()->type() == ASTNode::NODE_NAME && @@ -699,16 +844,18 @@ void decompyle(PycRef code, PycModule* mod, int indent) } } clean->removeLast(); // Always an extraneous return statement - - if (clean->nodes().size() == 0) - clean->append(new ASTNode(ASTNode::NODE_PASS)); } + // This is outside the clean check so a source block will always + // be compilable, even if decompylation failed. + if (clean->nodes().size() == 0) + clean->append(new ASTNode(ASTNode::NODE_PASS)); + inPrint = false; bool part1clean = cleanBuild; print_src(source, mod, indent); if (!cleanBuild || !part1clean) { - start_indent(indent); + start_line(indent); printf("# WARNING: Decompyle incomplete\n"); } } diff --git a/bytecode.cpp b/bytecode.cpp index 68a04b0..312012c 100644 --- a/bytecode.cpp +++ b/bytecode.cpp @@ -231,20 +231,10 @@ void print_const(PycRef obj, PycModule* mod) case PycObject::TYPE_STRING: case PycObject::TYPE_STRINGREF: case PycObject::TYPE_INTERNED: - if (mod->majorVer() == 3) - printf("b'"); - else - printf("'"); - OutputString(obj.cast(), QS_Single); - printf("'"); + OutputString(obj.cast(), (mod->majorVer() == 3) ? 
'b' : 0); break; case PycObject::TYPE_UNICODE: - if (mod->majorVer() == 3) - printf("'"); - else - printf("u'"); - OutputString(obj.cast(), QS_Single); - printf("'"); + OutputString(obj.cast(), (mod->majorVer() == 3) ? 0 : 'u'); break; case PycObject::TYPE_TUPLE: { diff --git a/pycdas.cpp b/pycdas.cpp index 2c631c3..998367c 100644 --- a/pycdas.cpp +++ b/pycdas.cpp @@ -71,20 +71,14 @@ void output_object(PycRef obj, PycModule* mod, int indent) case PycObject::TYPE_STRING: case PycObject::TYPE_STRINGREF: case PycObject::TYPE_INTERNED: - if (mod->majorVer() == 3) - iprintf(indent, "b'"); - else - iprintf(indent, "'"); - OutputString(obj.cast(), QS_Single); - printf("'\n"); + iprintf(indent, ""); + OutputString(obj.cast(), (mod->majorVer() == 3) ? 'b' : 0); + printf("\n"); break; case PycObject::TYPE_UNICODE: - if (mod->majorVer() == 3) - iprintf(indent, "'"); - else - iprintf(indent, "u'"); - OutputString(obj.cast(), QS_Single); - printf("'\n"); + iprintf(indent, ""); + OutputString(obj.cast(), (mod->majorVer() == 3) ? 0 : 'u'); + printf("\n"); break; case PycObject::TYPE_TUPLE: { diff --git a/pycdc.cpp b/pycdc.cpp index f3230e6..3fa321d 100644 --- a/pycdc.cpp +++ b/pycdc.cpp @@ -9,6 +9,10 @@ int main(int argc, char* argv[]) PycModule mod; mod.loadFromFile(argv[1]); + if (!mod.isValid()) { + fprintf(stderr, "Could not load file %s\n", argv[1]); + return 1; + } printf("# Source Generated with Decompyle++\n"); printf("# File: %s (Python %d.%d%s)\n", argv[1], mod.majorVer(), mod.minorVer(), (mod.majorVer() < 3 && mod.isUnicode()) ? 
" Unicode" : ""); diff --git a/string.cpp b/string.cpp index 3c6e536..037b6fb 100644 --- a/string.cpp +++ b/string.cpp @@ -49,41 +49,64 @@ bool PycString::isEqual(const char* str) const return (strcmp(m_value, str) == 0); } -void OutputString(PycRef str, QuoteStyle style, FILE* F) +void OutputString(PycRef str, char prefix, bool triple, FILE* F) { + if (prefix != 0) + fputc(prefix, F); + const char* ch = str->value(); int len = str->length(); - if (ch == 0) + if (ch == 0) { + fprintf(F, "''"); return; + } + + // Determine preferred quote style (Emulate Python's method) + bool useQuotes = false; + while (len--) { + if (*ch == '\'') { + useQuotes = true; + } else if (*ch == '"') { + useQuotes = false; + break; + } + ch++; + } + ch = str->value(); + len = str->length(); + + // Output the string + fputc(useQuotes ? '"' : '\'', F); while (len--) { if (*ch < 0x20 || *ch == 0x7F) { if (*ch == '\r') { fprintf(F, "\\r"); } else if (*ch == '\n') { - if (style == QS_BlockSingle || style == QS_BlockDouble) + if (triple) fputc('\n', F); else fprintf(F, "\\n"); } else if (*ch == '\t') { fprintf(F, "\\t"); } else { - fprintf(F, "\\x%x", *ch); + fprintf(F, "\\x%x", (*ch & 0xFF)); } } else if (*ch >= 0x80) { if (str->type() == PycObject::TYPE_UNICODE) { // Unicode stored as UTF-8... Let the stream interpret it fputc(*ch, F); } else { - fprintf(F, "\\x%x", *ch); + fprintf(F, "\\x%x", (*ch & 0xFF)); } } else { - if (style == QS_Single && *ch == '\'') + if (!useQuotes && *ch == '\'') fprintf(F, "\\'"); - else if (style == QS_Double && *ch == '"') + else if (useQuotes && *ch == '"') fprintf(F, "\\\""); else fputc(*ch, F); } ch++; } + fputc(useQuotes ? 
'"' : '\'', F); } diff --git a/string.h b/string.h index 8c1cdd3..0daa4ec 100644 --- a/string.h +++ b/string.h @@ -4,10 +4,6 @@ #include "object.h" #include -enum QuoteStyle { - QS_Single, QS_Double, QS_BlockSingle, QS_BlockDouble -}; - class PycString : public PycObject { public: PycString(int type = TYPE_STRING) @@ -28,6 +24,7 @@ private: int m_length; }; -void OutputString(PycRef str, QuoteStyle style, FILE* F = stdout); +void OutputString(PycRef str, char prefix = 0, + bool triple = false, FILE* F = stdout); #endif