4 Commits

Author SHA1 Message Date
e7397945d3 feat: file format detection, single file mode 2025-03-25 23:47:21 +08:00
48cab893c9 ci: build workflow 2025-03-15 23:26:59 +08:00
64df67ac8a fix: temp patch for robustness 2025-03-15 22:35:38 +08:00
d9e00c3762 docs: update readme 2025-03-07 11:08:08 +08:00
9 changed files with 294 additions and 173 deletions

55
.github/workflows/build.yml vendored Normal file
View File

@@ -0,0 +1,55 @@
name: Build
on:
push:
branches: [ci-build]
pull_request:
jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
steps:
- uses: actions/checkout@v4
- name: Build
run: |
mkdir build
cd build
cmake ..
cmake --build . --config Debug
cmake --install .
- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: pyarmor-1shot-build-${{ matrix.os }}
path: |
helpers
README.md
README-Decompyle++.markdown
LICENSE
windows-build:
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
- name: Build
run: |
mkdir build
cd build
cmake -G "MinGW Makefiles" ..
cmake --build . --config Debug
cmake --install .
- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: pyarmor-1shot-build-windows
path: |
helpers
README.md
README-Decompyle++.markdown
LICENSE

View File

@@ -1,67 +0,0 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
name: "CodeQL"
on:
push:
branches: [master]
pull_request:
# The branches below must be a subset of the branches above
branches: [master]
schedule:
- cron: '0 1 * * 2'
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
permissions:
actions: read
contents: read
security-events: write
strategy:
fail-fast: false
matrix:
language: ['cpp', 'python']
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
# Use only 'java' to analyze code written in Java, Kotlin or both
# Use only 'javascript' to analyze code written in JavaScript, TypeScript or both
# Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
steps:
- name: Checkout repository
uses: actions/checkout@v3
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v2
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.
# Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
# queries: security-extended,security-and-quality
# Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- if: matrix.language == 'python'
name: Autobuild Python
uses: github/codeql-action/autobuild@v2
- if: matrix.language == 'cpp'
name: Build C++
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo ..
make
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v2
with:
category: "/language:${{matrix.language}}"

View File

@@ -1,29 +0,0 @@
name: Linux-CI
on:
push:
branches: [master]
pull_request:
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- name: Configure and Build
run: |
(
mkdir build-debug && cd build-debug
cmake -DCMAKE_BUILD_TYPE=Debug ..
make -j4
)
(
mkdir build-release && cd build-release
cmake -DCMAKE_BUILD_TYPE=Debug ..
make -j4
)
- name: Test
run: |
cmake --build build-debug --target check
cmake --build build-release --target check

View File

@@ -1,29 +0,0 @@
name: MSVC-CI
on:
push:
branches: [master]
pull_request:
jobs:
build:
runs-on: windows-latest
steps:
- uses: actions/checkout@v2
- name: Configure and Build
run: |
mkdir build
cd build
cmake -G "Visual Studio 17 2022" -A Win32 ..
cmake --build . --config Debug
cmake --build . --config Release
- name: Test
run: |
cmake --build build --config Debug --target check
cmake --build build --config Release --target check
- name: Upload artifact
uses: actions/upload-artifact@v3
with:
name: pycdc-release
path: build\Release\*.exe

View File

@@ -1873,6 +1873,13 @@ PycRef<ASTNode> BuildFromCode(PycRef<PycCode> code, PycModule* mod)
mod->verCompare(3, 11) >= 0 ? code->qualName()->value() : code->name()->value());
}
}
// BEGIN ONESHOT TEMPORARY PATCH
// The root cause is not here, but in the way the blocks are created.
// Break out early to avoid a segfault until that is fixed properly.
if (blocks.top() == defblock)
break;
// END ONESHOT PATCH
PycRef<ASTBlock> tmp = curblock;
blocks.pop();

View File

@@ -1,12 +1,32 @@
# Pyarmor-Static-Unpack-1shot
# Pyarmor Static Unpack One-Shot Tool
Generally this project aims to statically convert (without executing) armored data - which can be regarded as an encrypted variant of pyc files - back to disassembly and (experimentally) source code. Therefore we forked the awesome [Decompyle++](https://github.com/zrax/pycdc) (aka pycdc).
[Pyarmor](https://github.com/dashingsoft/pyarmor) is a popular tool to protect Python source code. It turns Python scripts into binary data, which can be regarded as an encrypted variant of pyc files. They can be decrypted by a shared library (pyarmor_runtime) and then executed by Python interpreter.
Currently we are trying to support Pyarmor 8.0 - latest (9.1.1), Python 3.7 - 3.13, platforms covering Windows, Linux, macOS, and Android, with obfuscating options as many as possible. (However, we only have limited tests.)
This project aims to convert armored data back to bytecode assembly and (experimentally) source code. We forked the awesome [Decompyle++](https://github.com/zrax/pycdc) (aka pycdc) and added extra processing on top of it, such as modifying the abstract syntax tree.
If the data starts with `PY` followed by six digits, it is supported. Otherwise, if it starts with `PYARMOR`, it is generated by Pyarmor 7 or before, and is not supported.
> [!WARNING]
>
> **Disassembly results are accurate, but decompiled code can be incomplete and incorrect.** [See issue #3](https://github.com/Lil-House/Pyarmor-Static-Unpack-1shot/issues/3)
We cannot wait to make it public. Detailed write-up will be available soon. For those who are curious, temporarily you can check out [the similar work of G DATA Advanced Analytics](https://cyber.wtf/2025/02/12/unpacking-pyarmor-v8-scripts/).
## Features
### Static
You don't need to execute the encrypted scripts. We decrypt them using the same algorithm as pyarmor_runtime. This is useful when the scripts cannot be trusted.
### Universal
Currently we are trying to support Pyarmor 8.0 to 9.1.2 (latest) and Python 3.7 - 3.13 on all operating systems, with as many obfuscation options as possible. (However, we only have limited test coverage.)
You can run this tool in any environment; it does not need to match the one used by the obfuscated scripts or the runtime.
> [!NOTE]
>
> If the data starts with `PY` followed by six digits, it is supported. Otherwise, if it starts with `PYARMOR`, it is generated by Pyarmor 7 or earlier, and is not supported.
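For a quick manual check, a sketch along these lines (illustrative, not shipped with this tool) tells the two cases apart:

``` python
import re
import sys

def classify(path: str) -> str:
    """Peek at the first bytes and report which Pyarmor generation they belong to."""
    with open(path, 'rb') as f:
        head = f.read(16)
    if re.match(rb'PY\d{6}', head):
        return 'Pyarmor 8+ armored data (supported)'
    if head.startswith(b'PYARMOR'):
        return 'Pyarmor 7 or earlier (not supported)'
    return 'not recognized as armored data'

if __name__ == '__main__':
    print(classify(sys.argv[1]))
```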
### Easy to use
The only thing you need to do is specify where your obfuscated scripts are. The tool does everything else: detecting armored data, parsing, disassembling, and decompiling. See the "Usage" section below.
## Build
@@ -18,23 +38,25 @@ cmake --build .
cmake --install .
```
You can also download prebuilt binaries from the [releases page](https://github.com/Lil-House/Pyarmor-Static-Unpack-1shot/releases).
## Usage
Make sure the executable `pyarmor-1shot` (`pyarmor-1shot.exe` on Windows) exists in `helpers` directory, and run `helpers/shot.py` in Python 3 (no need to use the same version with obfuscated scripts) with the "root" directory of obfuscated scripts. It will recursively find and handle `pyarmor_runtime` and as much armored data as possible. For example:
``` bash
$ ls /path/to/scripts
__pycache__ pyarmor_runtime_000000 obf_main.py plain_src.py util.pyc packed.so folder_with_other_scripts readme.unrelated
$ python /path/to/helpers/shot.py /path/to/scripts
python /path/to/helpers/shot.py /path/to/scripts
```
Before running `shot.py`, make sure the executable `pyarmor-1shot` (`pyarmor-1shot.exe` on Windows) exists in the `helpers` directory.
You only need to specify the directory that contains all armored data and `pyarmor_runtime`. The tool recursively finds and handles as much of it as possible.
When necessary, specify a `pyarmor_runtime` executable with `-r path/to/pyarmor_runtime[.pyd|.so|.dylib]`.
All files generated by this tool have `.1shot.` in their names. If you want to save them in another directory instead of in place, use `-o another/path/`. The folder structure will remain unchanged.
Note:
- Subdirectories called `__pycache__` or `site-packages` will not be touched, and symbolic links will not be followed, to avoid repeat or forever loop and save time. If you really need them, run the script later in these directories (as "root" directory) and specify the runtime.
- Subdirectories will not be touched if the folder name is exactly `__pycache__` or `site-packages`, or if the folder directly contains a file named `.no1shot`; symbolic links are never followed. This avoids duplicate work and infinite loops, and saves time. If you really need them, run the script later in these directories (as the "root" directory) and specify the runtime; see the sketch after this list.
- Archives, executables generated by PyInstaller, and the like must be unpacked by other tools before decryption; otherwise you will encounter undefined behavior.
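The traversal rules above correspond roughly to this sketch (simplified from `helpers/shot.py`):

``` python
import os

def iter_candidate_files(root: str):
    """Yield files to inspect, honoring the skip rules described above."""
    for dir_path, dirs, files in os.walk(root, followlinks=False):
        if '.no1shot' in files:
            dirs.clear()  # prune this whole subtree
            continue
        for skipped in ('__pycache__', 'site-packages'):
            if skipped in dirs:
                dirs.remove(skipped)
        for name in files:
            if '.1shot.' in name:  # ignore files this tool generated
                continue
            yield os.path.join(dir_path, name)
```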
## Feedback
@@ -43,10 +65,7 @@ Feel free to open an issue if you have any questions, suggestions, or problems.
## Todo (PR Welcome!)
- [ ] Write-up
- [ ] Multi-platform pyarmor_runtime executable
- [ ] Accept more input forms
- [ ] Tests for different Pyarmor and Python versions
- [ ] Support more obfuscating options
- [ ] Use asyncio for concurrency
- [ ] Pyarmor 7 and before (Later or never.)

157
helpers/detect.py Normal file
View File

@@ -0,0 +1,157 @@
import logging
import os
from typing import List, Tuple
def ascii_ratio(data: bytes) -> float:
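    """Fraction of printable ASCII bytes in data, used as a rough text-vs-binary heuristic."""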
return sum(32 <= c < 127 for c in data) / len(data)
def source_as_file(file_path: str) -> List[bytes] | None:
try:
with open(file_path, 'r') as f:
co = compile(f.read(), '<str>', 'exec')
data = [i for i in co.co_consts if type(i) is bytes
and i.startswith(b'PY00') and len(i) > 64]
return data
except:
return None
def source_as_lines(file_path: str) -> List[bytes] | None:
data = []
try:
with open(file_path, 'r') as f:
for line in f:
try:
co = compile(line, '<str>', 'exec')
data.extend([i for i in co.co_consts if type(i) is bytes
and i.startswith(b'PY00') and len(i) > 64])
except:
# ignore not compilable lines
pass
except:
return None
return data
def find_data_from_bytes(data: bytes, max_count=-1) -> List[bytes]:
result = []
idx = 0
while len(result) != max_count:
idx = data.find(b'PY00')
if idx == -1:
break
data = data[idx:]
if len(data) < 64:
break
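        # "PY00" object layout (little-endian): bytes 28..31 hold the header
        # length and bytes 32..35 the body length; see the reads below.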
header_len = int.from_bytes(data[28:32], 'little')
body_len = int.from_bytes(data[32:36], 'little')
if header_len > 256 or body_len > 0xFFFFF or header_len + body_len > len(data):
# compressed or a coincidental match, skip
data = data[5:]
continue
result.append(data[:header_len + body_len])
# maybe followed by data for other Python versions from the same file,
# we do not extract them
followed_by_another_equivalent = int.from_bytes(
data[56:60], 'little') != 0
data = data[header_len + body_len:]
while followed_by_another_equivalent \
and data.startswith(b'PY00') \
and len(data) >= 64:
header_len = int.from_bytes(data[28:32], 'little')
body_len = int.from_bytes(data[32:36], 'little')
followed_by_another_equivalent = int.from_bytes(
data[56:60], 'little') != 0
data = data[header_len + body_len:]
return result
def nuitka_package(head: bytes, relative_path: str) -> None | List[Tuple[str, bytes]]:
first_occurrence = head.find(b'PY00')
if first_occurrence == -1:
return None
last_dot_bytecode = head.rfind(b'.bytecode\x00', 0, first_occurrence)
if last_dot_bytecode == -1:
return None
length = int.from_bytes(
head[last_dot_bytecode-4:last_dot_bytecode], 'little')
end = last_dot_bytecode + length
cur = last_dot_bytecode
result = []
while cur < end:
module_name_len = head.find(b'\x00', cur, end) - cur
module_name = head[cur:cur + module_name_len].decode('utf-8')
cur += module_name_len + 1
module_len = int.from_bytes(head[cur:cur + 4], 'little')
cur += 4
module_data = find_data_from_bytes(head[cur:cur + module_len], 1)
if module_data:
result.append((os.path.join(relative_path.rstrip(
'/\\') + '.1shot.ext', module_name), module_data[0]))
cur += module_len
if result:
logger = logging.getLogger('detect')
logger.info(f'Found data in Nuitka package: {relative_path}')
return result
return None
def detect_process(file_path: str, relative_path: str) -> None | List[Tuple[str, bytes]]:
'''
Returns a list of (relative_path, bytes_raw) tuples, or None.
Do not raise exceptions.
'''
logger = logging.getLogger('detect')
try:
with open(file_path, 'rb') as f:
head = f.read(16 * 1024 * 1024)
except:
logger.error(f'Failed to read file: {relative_path}')
return None
if b'__pyarmor__' not in head:
# no need to dig deeper
return None
if ascii_ratio(head[:2048]) >= 0.9:
# the whole file may not compile, but we can still try individual lines;
# None means failure (so we make another attempt),
# an empty list means success but no data found (so we skip this file)
result = source_as_file(file_path)
if result is None:
result = source_as_lines(file_path)
if result is None:
return None
match len(result):
case 0:
return None
case 1:
logger.info(f'Found data in source: {relative_path}')
return [(relative_path, result[0])]
case _:
logger.info(f'Found data in source: {relative_path}')
return [(f'{relative_path}__{i}', result[i]) for i in range(len(result))]
# binary file
# ignore data beyond 16 MB until we have a reason to read more
if b'Error, corrupted constants object' in head:
# an interesting special case: packer put armored data in a Nuitka package
# we can know the exact module names, instead of adding boring __0, __1, ...
return nuitka_package(head, relative_path)
result = find_data_from_bytes(head)
match len(result):
case 0:
return None
case 1:
logger.info(f'Found data in binary: {relative_path}')
return [(relative_path, result[0])]
case _:
logger.info(f'Found data in binary: {relative_path}')
return [(f'{relative_path}__{i}', result[i]) for i in range(len(result))]
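For reference, a minimal driver for this helper could look like the sketch below (an assumption: it is run next to `detect.py`; the real entry point is `helpers/shot.py`):

``` python
import sys

from detect import detect_process

def dump_armored(path: str) -> None:
    """Write every armored blob detect_process finds in `path` to its own file."""
    found = detect_process(path, path)
    if not found:
        print('no armored data found')
        return
    for relative_path, blob in found:
        out_name = relative_path.replace('/', '_').replace('\\', '_') + '.bin'
        with open(out_name, 'wb') as f:
            f.write(blob)
        print(f'wrote {len(blob)} bytes to {out_name}')

if __name__ == '__main__':
    dump_armored(sys.argv[1])
```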

View File

@@ -5,6 +5,7 @@ import os
import subprocess
from typing import Dict, List, Tuple
from detect import detect_process
from runtime import RuntimeInfo
@@ -157,6 +158,7 @@ def main():
if args.runtime:
specified_runtime = RuntimeInfo(args.runtime)
print(specified_runtime)
runtimes = {specified_runtime.serial_number: specified_runtime}
else:
specified_runtime = None
@@ -167,23 +169,50 @@ def main():
if args.output_dir and not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
if os.path.isfile(args.directory):
if specified_runtime is None:
logger.error('Please specify `pyarmor_runtime` file by `-r` if input is a file')
return
logger.info('Single file mode')
result = detect_process(args.directory, args.directory)
if result is None:
logger.error('No armored data found')
return
sequences.extend(result)
decrypt_process(runtimes, sequences, args)
return # single file mode ends here
dir_path: str
dirs: List[str]
files: List[str]
for dir_path, dirs, files in os.walk(args.directory, followlinks=False):
if '.no1shot' in files:
logger.info(f'Skipping {dir_path} because of `.no1shot`')
dirs.clear()
files.clear()
continue
for d in ['__pycache__', 'site-packages']:
if d in dirs:
dirs.remove(d)
for file_name in files:
if '.1shot.' in file_name:
continue
handled = False
file_path = os.path.join(dir_path, file_name)
relative_path = os.path.relpath(file_path, args.directory)
if file_name.endswith('.pyz'):
with open(file_path, 'rb') as f:
head = f.read(16 * 1024 * 1024)
if b'PY00' in head \
and (not os.path.exists(file_path + '_extracted')
or len(os.listdir(file_path + '_extracted')) == 0):
logger.error(
f'A PYZ file containing armored data was detected, but it has not been extracted by other tools. This is not a problem with this tool. If the folder was extracted by Pyinstxtractor, please read the output of Pyinstxtractor carefully. ({relative_path})')
continue
# is pyarmor_runtime?
if not handled \
and specified_runtime is None \
if specified_runtime is None \
and file_name.startswith('pyarmor_runtime') \
and file_name.endswith(('.pyd', '.so', '.dylib')):
try:
@@ -192,40 +221,13 @@ def main():
logger.info(
f'Found new runtime: {new_runtime.serial_number} ({file_path})')
print(new_runtime)
handled = True
continue
except:
pass
try:
with open(file_path, 'rb') as f:
beacon = f.read(16 * 1024 * 1024)
except:
logger.error(f'Failed to read file: {relative_path}')
continue
# is UTF-8 source?
# TODO: only support natural one line now
if not handled and b'__pyarmor__(__name__, __file__,' in beacon:
try:
with open(file_path, 'r') as f:
for line in f:
if line.startswith('__pyarmor__(') and line.rstrip().endswith(')'):
co = compile(line, '<str>', 'exec')
bytes_raw = co.co_consts[0]
assert type(bytes_raw) is bytes
assert bytes_raw.startswith(b'PY')
assert len(bytes_raw) > 64
break
logger.info(f'Found data in source: {relative_path}')
# FIXME: bytes_raw can be kept from last iteration
sequences.append((relative_path, bytes_raw))
del bytes_raw
handled = True
except Exception as e:
logger.error(f'Assume source, but {e} ({file_path})')
# TODO: is Nuitka package?
# TODO: is pyc or single marshalled binary?
result = detect_process(file_path, relative_path)
if result is not None:
sequences.extend(result)
if not runtimes:
logger.error('No runtime found')

View File

@@ -86,6 +86,9 @@ int main(int argc, char* argv[])
return 1;
}
das_out_file.flush();
das_out_file.close();
dc_out_file << "# Source Generated with Decompyle++\n";
formatted_print(dc_out_file, "# File: %s (Python %d.%d%s)\n\n", dispname,
mod.majorVer(), mod.minorVer(),
@@ -97,5 +100,8 @@ int main(int argc, char* argv[])
return 1;
}
dc_out_file.flush();
dc_out_file.close();
return 0;
}