From e8e10f1419953b5150b63da0bc4e077a0571b15b Mon Sep 17 00:00:00 2001 From: Sahil Jain Date: Tue, 15 Jul 2025 22:33:30 +0530 Subject: [PATCH 1/6] Parse exception table --- bytecode.cpp | 15 +++++++++++++++ bytecode.h | 2 ++ pyc_code.cpp | 41 +++++++++++++++++++++++++++++++++++++++++ pyc_code.h | 4 ++++ pycdas.cpp | 10 +++++----- 5 files changed, 67 insertions(+), 5 deletions(-) diff --git a/bytecode.cpp b/bytecode.cpp index c6b7cba..0b067b2 100644 --- a/bytecode.cpp +++ b/bytecode.cpp @@ -600,3 +600,18 @@ void bc_disasm(std::ostream& pyc_output, PycRef code, PycModule* mod, pyc_output << "\n"; } } + +void bc_exceptiontable(std::ostream& pyc_output, PycRef code, + int indent) +{ + for (auto tuple: code->exceptTableEntries()) { + + for (int i=0; i(tuple) << " to " << std::get<1>(tuple); + pyc_output << " -> " << std::get<2>(tuple) << " "; + pyc_output << "[" << std::get<3>(tuple) << "] " << (std::get<4>(tuple) ? "lasti": ""); + pyc_output << "\n"; + } +} \ No newline at end of file diff --git a/bytecode.h b/bytecode.h index 7e4179e..3c0d9d3 100644 --- a/bytecode.h +++ b/bytecode.h @@ -32,3 +32,5 @@ void print_const(std::ostream& pyc_output, PycRef obj, PycModule* mod void bc_next(PycBuffer& source, PycModule* mod, int& opcode, int& operand, int& pos); void bc_disasm(std::ostream& pyc_output, PycRef code, PycModule* mod, int indent, unsigned flags); +void bc_exceptiontable(std::ostream& pyc_output, PycRef code, + int indent); diff --git a/pyc_code.cpp b/pyc_code.cpp index ba63eed..ec8be8e 100644 --- a/pyc_code.cpp +++ b/pyc_code.cpp @@ -128,3 +128,44 @@ PycRef PycCode::getCellVar(PycModule* mod, int idx) const ? 
m_freeVars->get(idx - m_cellVars->size()).cast() : m_cellVars->get(idx).cast(); } + +int _parse_varint(PycBuffer& data, int& pos) { + int b = data.getByte(); + pos += 1; + + int val = b & 63; + while (b & 64) { + val <<= 6; + + b = data.getByte(); + pos += 1; + + val |= (b & 63); + } + return val; +} + +std::vector PycCode::exceptTableEntries() const +{ + PycBuffer data(m_exceptTable->value(), m_exceptTable->length()); + + std::vector entries; + + int pos = 0; + while (!data.atEof()) { + + int start = _parse_varint(data, pos) * 2; + int length = _parse_varint(data, pos) * 2; + int end = start + length; + + int target = _parse_varint(data, pos) * 2; + int dl = _parse_varint(data, pos); + + int depth = dl >> 1; + bool lasti = bool(dl & 1); + + entries.emplace_back(start, end, target, depth, lasti); + } + + return entries; +} \ No newline at end of file diff --git a/pyc_code.h b/pyc_code.h index e6b2ce9..0a64ee5 100644 --- a/pyc_code.h +++ b/pyc_code.h @@ -87,6 +87,10 @@ public: m_globalsUsed.emplace_back(std::move(varname)); } + typedef std::tuple exception_table_entry_t; + + std::vector exceptTableEntries() const; + private: int m_argCount, m_posOnlyArgCount, m_kwOnlyArgCount, m_numLocals; int m_stackSize, m_flags; diff --git a/pycdas.cpp b/pycdas.cpp index b73410f..7b326b1 100644 --- a/pycdas.cpp +++ b/pycdas.cpp @@ -145,16 +145,16 @@ void output_object(PycRef obj, PycModule* mod, int indent, iputs(pyc_output, indent + 1, "[Disassembly]\n"); bc_disasm(pyc_output, codeObj, mod, indent + 2, flags); + if (mod->verCompare(3, 11) >= 0) { + iputs(pyc_output, indent + 1, "[Exception Table]\n"); + bc_exceptiontable(pyc_output, codeObj, indent+2); + } + if (mod->verCompare(1, 5) >= 0 && (flags & Pyc::DISASM_PYCODE_VERBOSE) != 0) { iprintf(pyc_output, indent + 1, "First Line: %d\n", codeObj->firstLine()); iputs(pyc_output, indent + 1, "[Line Number Table]\n"); output_object(codeObj->lnTable().cast(), mod, indent + 2, flags, pyc_output); } - - if (mod->verCompare(3, 11) >= 0 
&& (flags & Pyc::DISASM_PYCODE_VERBOSE) != 0) { - iputs(pyc_output, indent + 1, "[Exception Table]\n"); - output_object(codeObj->exceptTable().cast(), mod, indent + 2, flags, pyc_output); - } } break; case PycObject::TYPE_STRING: From ff0c1450b49d5228bc7078178799fe6c95bd22e9 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Thu, 28 Aug 2025 15:14:55 -0700 Subject: [PATCH 2/6] Abort immediately when attempting to read past end of stream. No consumers of readByte() were actually checking for EOF, so they would all keep re-reading the same byte over and over again, potentially until the process runs out of memory (ref #572). --- data.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/data.cpp b/data.cpp index 1be5aa6..9509a29 100644 --- a/data.cpp +++ b/data.cpp @@ -53,8 +53,10 @@ bool PycFile::atEof() const int PycFile::getByte() { int ch = fgetc(m_stream); - if (ch == EOF) - ungetc(ch, m_stream); + if (ch == EOF) { + fputs("PycFile::getByte(): Unexpected end of stream\n", stderr); + std::exit(1); + } return ch; } @@ -67,8 +69,10 @@ int PycFile::getBuffer(int bytes, void* buffer) /* PycBuffer */ int PycBuffer::getByte() { - if (atEof()) - return EOF; + if (atEof()) { + fputs("PycBuffer::getByte(): Unexpected end of stream\n", stderr); + std::exit(1); + } int ch = (int)(*(m_buffer + m_pos)); ++m_pos; return ch & 0xFF; // Make sure it's just a byte! 
From 0e7be40367245a5fe75a4d8c273b0105a62d9e99 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Thu, 28 Aug 2025 15:33:13 -0700 Subject: [PATCH 3/6] Add some extra guards against null dereference and empty std::stack pops Fixes segfault cases of #572 --- ASTree.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/ASTree.cpp b/ASTree.cpp index 3542921..c603e98 100644 --- a/ASTree.cpp +++ b/ASTree.cpp @@ -1231,8 +1231,12 @@ PycRef BuildFromCode(PycRef code, PycModule* mod) break; } - stack = stack_hist.top(); - stack_hist.pop(); + if (!stack_hist.empty()) { + stack = stack_hist.top(); + stack_hist.pop(); + } else { + fprintf(stderr, "Warning: Stack history is empty, something wrong might have happened\n"); + } PycRef prev = curblock; PycRef nil; @@ -1389,10 +1393,10 @@ PycRef BuildFromCode(PycRef code, PycModule* mod) } while (prev != nil); - curblock = blocks.top(); - - if (curblock->blktype() == ASTBlock::BLK_EXCEPT) { - curblock->setEnd(pos+offs); + if (!blocks.empty()) { + curblock = blocks.top(); + if (curblock->blktype() == ASTBlock::BLK_EXCEPT) + curblock->setEnd(pos+offs); } } break; @@ -1769,7 +1773,8 @@ PycRef BuildFromCode(PycRef code, PycModule* mod) else curblock->append(new ASTPrint(stack.top(), stream)); stack.pop(); - stream->setProcessed(); + if (stream) + stream->setProcessed(); } break; case Pyc::PRINT_NEWLINE: @@ -1797,7 +1802,8 @@ PycRef BuildFromCode(PycRef code, PycModule* mod) else curblock->append(new ASTPrint(nullptr, stream)); stack.pop(); - stream->setProcessed(); + if (stream) + stream->setProcessed(); } break; case Pyc::RAISE_VARARGS_A: From 38799f5cfb8c45930f3dc8c1c08adfc904fa02af Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Thu, 28 Aug 2025 15:58:28 -0700 Subject: [PATCH 4/6] Also check EOF in getBuffer() --- data.cpp | 16 ++++++++++------ data.h | 6 +++--- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/data.cpp b/data.cpp index 9509a29..2b560a7 100644 --- a/data.cpp +++ 
b/data.cpp @@ -60,9 +60,12 @@ int PycFile::getByte() return ch; } -int PycFile::getBuffer(int bytes, void* buffer) +void PycFile::getBuffer(int bytes, void* buffer) { - return (int)fread(buffer, 1, bytes, m_stream); + if (fread(buffer, 1, bytes, m_stream) != (size_t)bytes) { + fputs("PycFile::getBuffer(): Unexpected end of stream\n", stderr); + std::exit(1); + } } @@ -78,14 +81,15 @@ int PycBuffer::getByte() return ch & 0xFF; // Make sure it's just a byte! } -int PycBuffer::getBuffer(int bytes, void* buffer) +void PycBuffer::getBuffer(int bytes, void* buffer) { - if (m_pos + bytes > m_size) - bytes = m_size - m_pos; + if (m_pos + bytes > m_size) { + fputs("PycBuffer::getBuffer(): Unexpected end of stream\n", stderr); + std::exit(1); + } if (bytes != 0) memcpy(buffer, (m_buffer + m_pos), bytes); m_pos += bytes; - return bytes; } int formatted_print(std::ostream& stream, const char* format, ...) diff --git a/data.h b/data.h index 376d318..28cc85e 100644 --- a/data.h +++ b/data.h @@ -19,7 +19,7 @@ public: virtual bool atEof() const = 0; virtual int getByte() = 0; - virtual int getBuffer(int bytes, void* buffer) = 0; + virtual void getBuffer(int bytes, void* buffer) = 0; int get16(); int get32(); Pyc_INT64 get64(); @@ -34,7 +34,7 @@ public: bool atEof() const override; int getByte() override; - int getBuffer(int bytes, void* buffer) override; + void getBuffer(int bytes, void* buffer) override; private: FILE* m_stream; @@ -50,7 +50,7 @@ public: bool atEof() const override { return (m_pos == m_size); } int getByte() override; - int getBuffer(int bytes, void* buffer) override; + void getBuffer(int bytes, void* buffer) override; private: const unsigned char* m_buffer; From 577720302ed2097c3300d7af38313bfbeec4bd51 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Thu, 28 Aug 2025 16:42:03 -0700 Subject: [PATCH 5/6] Add basic protection against circular references in pycdas and pycdc. This fixes the last case of fuzzer errors detected by #572. 
--- ASTree.cpp | 21 +++++++++++++++++++++ pycdas.cpp | 11 +++++++++++ 2 files changed, 32 insertions(+) diff --git a/ASTree.cpp b/ASTree.cpp index c603e98..6635808 100644 --- a/ASTree.cpp +++ b/ASTree.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include "ASTree.h" #include "FastStack.h" #include "pyc_numeric.h" @@ -2779,6 +2780,8 @@ void print_formatted_value(PycRef formatted_value, PycModule* pyc_output << "}"; } +static std::unordered_set node_seen; + void print_src(PycRef node, PycModule* mod, std::ostream& pyc_output) { if (node == NULL) { @@ -2787,6 +2790,12 @@ void print_src(PycRef node, PycModule* mod, std::ostream& pyc_output) return; } + if (node_seen.find((ASTNode *)node) != node_seen.end()) { + fputs("WARNING: Circular reference detected\n", stderr); + return; + } + node_seen.insert((ASTNode *)node); + switch (node->type()) { case ASTNode::NODE_BINARY: case ASTNode::NODE_COMPARE: @@ -3442,10 +3451,12 @@ void print_src(PycRef node, PycModule* mod, std::ostream& pyc_output) pyc_output << "type() << ">"; fprintf(stderr, "Unsupported Node type: %d\n", node->type()); cleanBuild = false; + node_seen.erase((ASTNode *)node); return; } cleanBuild = true; + node_seen.erase((ASTNode *)node); } bool print_docstring(PycRef obj, int indent, PycModule* mod, @@ -3462,8 +3473,16 @@ bool print_docstring(PycRef obj, int indent, PycModule* mod, return false; } +static std::unordered_set code_seen; + void decompyle(PycRef code, PycModule* mod, std::ostream& pyc_output) { + if (code_seen.find((PycCode *)code) != code_seen.end()) { + fputs("WARNING: Circular reference detected\n", stderr); + return; + } + code_seen.insert((PycCode *)code); + PycRef source = BuildFromCode(code, mod); PycRef clean = source.cast(); @@ -3557,4 +3576,6 @@ void decompyle(PycRef code, PycModule* mod, std::ostream& pyc_output) start_line(cur_indent, pyc_output); pyc_output << "# WARNING: Decompyle incomplete\n"; } + + code_seen.erase((PycCode *)code); } diff --git a/pycdas.cpp 
b/pycdas.cpp index b73410f..139d985 100644 --- a/pycdas.cpp +++ b/pycdas.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "pyc_module.h" #include "pyc_numeric.h" #include "bytecode.h" @@ -73,6 +74,8 @@ static void iprintf(std::ostream& pyc_output, int indent, const char* fmt, ...) va_end(varargs); } +static std::unordered_set out_seen; + void output_object(PycRef obj, PycModule* mod, int indent, unsigned flags, std::ostream& pyc_output) { @@ -81,6 +84,12 @@ void output_object(PycRef obj, PycModule* mod, int indent, return; } + if (out_seen.find((PycObject *)obj) != out_seen.end()) { + fputs("WARNING: Circular reference detected\n", stderr); + return; + } + out_seen.insert((PycObject *)obj); + switch (obj->type()) { case PycObject::TYPE_CODE: case PycObject::TYPE_CODE2: @@ -246,6 +255,8 @@ void output_object(PycRef obj, PycModule* mod, int indent, default: iprintf(pyc_output, indent, "\n", obj->type()); } + + out_seen.erase((PycObject *)obj); } int main(int argc, char* argv[]) From d8c6fdf7112a2ddb058e6624d7707bba2952ed65 Mon Sep 17 00:00:00 2001 From: Sahil Jain Date: Sat, 30 Aug 2025 20:01:32 +0530 Subject: [PATCH 6/6] Address comments --- bytecode.cpp | 12 ++++++------ pyc_code.cpp | 14 +++++++------- pyc_code.h | 16 +++++++++++++--- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/bytecode.cpp b/bytecode.cpp index 0b067b2..6bee279 100644 --- a/bytecode.cpp +++ b/bytecode.cpp @@ -604,14 +604,14 @@ void bc_disasm(std::ostream& pyc_output, PycRef code, PycModule* mod, void bc_exceptiontable(std::ostream& pyc_output, PycRef code, int indent) { - for (auto tuple: code->exceptTableEntries()) { + for (const auto& entry : code->exceptionTableEntries()) { for (int i=0; i(tuple) << " to " << std::get<1>(tuple); - pyc_output << " -> " << std::get<2>(tuple) << " "; - pyc_output << "[" << std::get<3>(tuple) << "] " << (std::get<4>(tuple) ? 
"lasti": ""); - pyc_output << "\n"; + pyc_output << entry.start_offset << " to " << entry.end_offset + << " -> " << entry.target << " [" << entry.stack_depth + << "] " << (entry.push_lasti ? "lasti": "") + << "\n"; } -} \ No newline at end of file +} diff --git a/pyc_code.cpp b/pyc_code.cpp index ec8be8e..b88f666 100644 --- a/pyc_code.cpp +++ b/pyc_code.cpp @@ -133,23 +133,23 @@ int _parse_varint(PycBuffer& data, int& pos) { int b = data.getByte(); pos += 1; - int val = b & 63; - while (b & 64) { + int val = b & 0x3F; + while (b & 0x40) { val <<= 6; b = data.getByte(); pos += 1; - val |= (b & 63); + val |= (b & 0x3F); } return val; } -std::vector PycCode::exceptTableEntries() const +std::vector PycCode::exceptionTableEntries() const { PycBuffer data(m_exceptTable->value(), m_exceptTable->length()); - std::vector entries; + std::vector entries; int pos = 0; while (!data.atEof()) { @@ -164,8 +164,8 @@ std::vector PycCode::exceptTableEntries() cons int depth = dl >> 1; bool lasti = bool(dl & 1); - entries.emplace_back(start, end, target, depth, lasti); + entries.push_back(PycExceptionTableEntry(start, end, target, depth, lasti)); } return entries; -} \ No newline at end of file +} diff --git a/pyc_code.h b/pyc_code.h index 0a64ee5..6485729 100644 --- a/pyc_code.h +++ b/pyc_code.h @@ -8,6 +8,18 @@ class PycData; class PycModule; +class PycExceptionTableEntry { +public: + int start_offset; // inclusive + int end_offset; // exclusive + int target; + int stack_depth; + bool push_lasti; + + PycExceptionTableEntry(int m_start_offset, int m_end_offset, int m_target, int m_stack_depth, bool m_push_lasti) : + start_offset(m_start_offset), end_offset(m_end_offset), target(m_target), stack_depth(m_stack_depth), push_lasti(m_push_lasti) {}; +}; + class PycCode : public PycObject { public: typedef std::vector> globals_t; @@ -87,9 +99,7 @@ public: m_globalsUsed.emplace_back(std::move(varname)); } - typedef std::tuple exception_table_entry_t; - - std::vector exceptTableEntries() 
const; + std::vector exceptionTableEntries() const; private: int m_argCount, m_posOnlyArgCount, m_kwOnlyArgCount, m_numLocals;