feat: file format detection, single file mode

2025-03-25 23:47:21 +08:00
parent 48cab893c9
commit e7397945d3
3 changed files with 195 additions and 37 deletions


@@ -6,7 +6,7 @@ This project aims to convert armored data back to bytecode assembly and (experim
> [!WARNING]
>
> **Disassembly results are accurate, but decompiled code can be incomplete and incorrect.** Bytecode has changed a lot in recent Python versions, while pycdc has limited support for bytecode in newer versions.
> **Disassembly results are accurate, but decompiled code can be incomplete and incorrect.** [See issue #3](https://github.com/Lil-House/Pyarmor-Static-Unpack-1shot/issues/3)
## Features
@@ -56,7 +56,7 @@ All files generated from this tool have a `.1shot.` in file names. If you want t
Note:
- Subdirectories called `__pycache__` or `site-packages` will not be touched, and symbolic links will not be followed, to avoid repeat or forever loop and save time. If you really need them, run the script later in these directories and specify the runtime.
- Subdirectories will not be touched if the folder name is exactly `__pycache__` or `site-packages` or it directly contains a file named `.no1shot`, and symbolic links will not be followed, to avoid repeat or forever loop and save time. If you really need them, run the script later in these directories and specify the runtime.
- Archives, executables generated by PyInstaller and so on, must be unpacked by other tools before decrypting, or you will encounter undefined behavior.
## Feedback
@@ -66,7 +66,6 @@ Feel free to open an issue if you have any questions, suggestions, or problems.
## Todo (PR Welcome!)
- [ ] Multi-platform pyarmor_runtime executable
- [ ] Accept more input forms
- [ ] Support more obfuscating options
- [ ] Use asyncio for concurrency
- [ ] Pyarmor 7 and before (Later or never.)
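The `.no1shot` opt-out described in the README note above can be exercised as follows. This is a minimal sketch with hypothetical paths: an empty marker file is enough, and the walk below mirrors the directory-pruning check this commit adds to the main script.

```python
import os
import pathlib

# Hypothetical layout: opt `project/vendored/` out of scanning.
pathlib.Path('project/vendored').mkdir(parents=True, exist_ok=True)
pathlib.Path('project/vendored/.no1shot').touch()

for dir_path, dirs, files in os.walk('project', followlinks=False):
    if '.no1shot' in files:
        dirs.clear()   # do not descend into this subtree
        files.clear()  # and do not process its files
        continue
    # ... per-file detection would run here ...
```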

helpers/detect.py (new file, 157 lines)

@@ -0,0 +1,157 @@
import logging
import os
from typing import List, Tuple


def ascii_ratio(data: bytes) -> float:
    return sum(32 <= c < 127 for c in data) / len(data)


def source_as_file(file_path: str) -> List[bytes] | None:
    try:
        with open(file_path, 'r') as f:
            co = compile(f.read(), '<str>', 'exec')
        data = [i for i in co.co_consts if type(i) is bytes
                and i.startswith(b'PY00') and len(i) > 64]
        return data
    except:
        return None


def source_as_lines(file_path: str) -> List[bytes] | None:
    data = []
    try:
        with open(file_path, 'r') as f:
            for line in f:
                try:
                    co = compile(line, '<str>', 'exec')
                    data.extend([i for i in co.co_consts if type(i) is bytes
                                 and i.startswith(b'PY00') and len(i) > 64])
                except:
                    # ignore not compilable lines
                    pass
    except:
        return None
    return data


def find_data_from_bytes(data: bytes, max_count=-1) -> List[bytes]:
    result = []
    idx = 0
    while len(result) != max_count:
        idx = data.find(b'PY00')
        if idx == -1:
            break
        data = data[idx:]
        if len(data) < 64:
            break
        header_len = int.from_bytes(data[28:32], 'little')
        body_len = int.from_bytes(data[32:36], 'little')
        if header_len > 256 or body_len > 0xFFFFF or header_len + body_len > len(data):
            # compressed or coincident, skip
            data = data[5:]
            continue
        result.append(data[:header_len + body_len])
        # maybe followed by data for other Python versions from the same file,
        # we do not extract them
        followed_by_another_equivalent = int.from_bytes(
            data[56:60], 'little') != 0
        data = data[header_len + body_len:]
        while followed_by_another_equivalent \
                and data.startswith(b'PY00') \
                and len(data) >= 64:
            header_len = int.from_bytes(data[28:32], 'little')
            body_len = int.from_bytes(data[32:36], 'little')
            followed_by_another_equivalent = int.from_bytes(
                data[56:60], 'little') != 0
            data = data[header_len + body_len:]
    return result


def nuitka_package(head: bytes, relative_path: str) -> None | List[Tuple[str, bytes]]:
    first_occurrence = head.find(b'PY00')
    if first_occurrence == -1:
        return None
    last_dot_bytecode = head.rfind(b'.bytecode\x00', 0, first_occurrence)
    if last_dot_bytecode == -1:
        return None
    length = int.from_bytes(
        head[last_dot_bytecode-4:last_dot_bytecode], 'little')
    end = last_dot_bytecode + length
    cur = last_dot_bytecode
    result = []
    while cur < end:
        module_name_len = head.find(b'\x00', cur, end) - cur
        module_name = head[cur:cur + module_name_len].decode('utf-8')
        cur += module_name_len + 1
        module_len = int.from_bytes(head[cur:cur + 4], 'little')
        cur += 4
        module_data = find_data_from_bytes(head[cur:cur + module_len], 1)
        if module_data:
            result.append((os.path.join(relative_path.rstrip(
                '/\\') + '.1shot.ext', module_name), module_data[0]))
        cur += module_len
    if result:
        logger = logging.getLogger('detect')
        logger.info(f'Found data in Nuitka package: {relative_path}')
        return result
    return None


def detect_process(file_path: str, relative_path: str) -> None | List[Tuple[str, bytes]]:
    '''
    Returns a list of (relative_path, bytes_raw) tuples, or None.
    Do not raise exceptions.
    '''
    logger = logging.getLogger('detect')

    try:
        with open(file_path, 'rb') as f:
            head = f.read(16 * 1024 * 1024)
    except:
        logger.error(f'Failed to read file: {relative_path}')
        return None

    if b'__pyarmor__' not in head:
        # no need to dig deeper
        return None

    if ascii_ratio(head[:2048]) >= 0.9:
        # the whole file may not be compiled, but we can still try some lines;
        # None means failure (then we make another try),
        # empty list means success but no data found (then we skip this file)
        result = source_as_file(file_path)
        if result is None:
            result = source_as_lines(file_path)
        if result is None:
            return None
        match len(result):
            case 0:
                return None
            case 1:
                logger.info(f'Found data in source: {relative_path}')
                return [(relative_path, result[0])]
            case _:
                logger.info(f'Found data in source: {relative_path}')
                return [(f'{relative_path}__{i}', result[i]) for i in range(len(result))]

    # binary file
    # ignore data after 16MB, before we have a reason to read more
    if b'Error, corrupted constants object' in head:
        # an interesting special case: packer put armored data in a Nuitka package
        # we can know the exact module names, instead of adding boring __0, __1, ...
        return nuitka_package(head, relative_path)

    result = find_data_from_bytes(head)
    match len(result):
        case 0:
            return None
        case 1:
            logger.info(f'Found data in binary: {relative_path}')
            return [(relative_path, result[0])]
        case _:
            logger.info(f'Found data in binary: {relative_path}')
            return [(f'{relative_path}__{i}', result[i]) for i in range(len(result))]
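For reference, the blob layout that `find_data_from_bytes` expects can be exercised with a synthetic self-check. The offsets are simply the ones the parser above reads (magic at 0, header length at 28, body length at 32, the "followed by another equivalent" flag at 56); the blob is fabricated, not real Pyarmor output, and `helpers/` is assumed to be on `sys.path`.

```python
from detect import find_data_from_bytes

header_len, body_len = 64, 16
blob = (
    b'PY00'                             # magic at offset 0
    + b'\x00' * 24                      # padding up to offset 28
    + header_len.to_bytes(4, 'little')  # offsets 28..32: header length
    + body_len.to_bytes(4, 'little')    # offsets 32..36: body length
    + b'\x00' * 28                      # rest of the 64-byte header; flag at 56..60 stays 0
    + b'\xaa' * body_len                # fake encrypted body
)

# The parser should extract exactly this blob from the surrounding junk.
found = find_data_from_bytes(b'junk before ' + blob + b' junk after')
assert found == [blob]
```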

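A usage sketch for the entry point, `detect_process`, following the contract in its docstring (paths are hypothetical; `helpers/` on `sys.path` is assumed):

```python
from detect import detect_process

found = detect_process('/samples/app/obf_module.py', 'app/obf_module.py')
if found is None:
    print('no armored data (or the file could not be read)')
else:
    for relative_path, bytes_raw in found:
        # One entry per PY00 blob; when a file yields several blobs,
        # the relative paths get __0, __1, ... suffixes.
        print(relative_path, len(bytes_raw))
```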

@@ -5,6 +5,7 @@ import os
import subprocess
from typing import Dict, List, Tuple
from detect import detect_process
from runtime import RuntimeInfo
@@ -157,6 +158,7 @@ def main():
    if args.runtime:
        specified_runtime = RuntimeInfo(args.runtime)
        print(specified_runtime)
        runtimes = {specified_runtime.serial_number: specified_runtime}
    else:
        specified_runtime = None
@@ -167,23 +169,50 @@ def main():
    if args.output_dir and not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if os.path.isfile(args.directory):
        if specified_runtime is None:
            logger.error('Please specify `pyarmor_runtime` file by `-r` if input is a file')
            return
        logger.info('Single file mode')
        result = detect_process(args.directory, args.directory)
        if result is None:
            logger.error('No armored data found')
            return
        sequences.extend(result)
        decrypt_process(runtimes, sequences, args)
        return  # single file mode ends here

    dir_path: str
    dirs: List[str]
    files: List[str]
    for dir_path, dirs, files in os.walk(args.directory, followlinks=False):
        if '.no1shot' in files:
            logger.info(f'Skipping {dir_path} because of `.no1shot`')
            dirs.clear()
            files.clear()
            continue
        for d in ['__pycache__', 'site-packages']:
            if d in dirs:
                dirs.remove(d)
        for file_name in files:
            if '.1shot.' in file_name:
                continue
            handled = False
            file_path = os.path.join(dir_path, file_name)
            relative_path = os.path.relpath(file_path, args.directory)

            if file_name.endswith('.pyz'):
                with open(file_path, 'rb') as f:
                    head = f.read(16 * 1024 * 1024)
                if b'PY00' in head \
                        and (not os.path.exists(file_path + '_extracted')
                             or len(os.listdir(file_path + '_extracted')) == 0):
                    logger.error(
                        f'A PYZ file containing armored data is detected, but the PYZ file has not been extracted by other tools. This error is not a problem with this tool. If the folder is extracted by Pyinstxtractor, please read the output information of Pyinstxtractor carefully. ({relative_path})')
                    continue

            # is pyarmor_runtime?
            if not handled \
                    and specified_runtime is None \
            if specified_runtime is None \
                    and file_name.startswith('pyarmor_runtime') \
                    and file_name.endswith(('.pyd', '.so', '.dylib')):
                try:
@@ -192,40 +221,13 @@ def main():
                    logger.info(
                        f'Found new runtime: {new_runtime.serial_number} ({file_path})')
                    print(new_runtime)
                    handled = True
                    continue
                except:
                    pass

            try:
                with open(file_path, 'rb') as f:
                    beacon = f.read(16 * 1024 * 1024)
            except:
                logger.error(f'Failed to read file: {relative_path}')
                continue

            # is UTF-8 source?
            # TODO: only support natural one line now
            if not handled and b'__pyarmor__(__name__, __file__,' in beacon:
                try:
                    with open(file_path, 'r') as f:
                        for line in f:
                            if line.startswith('__pyarmor__(') and line.rstrip().endswith(')'):
                                co = compile(line, '<str>', 'exec')
                                bytes_raw = co.co_consts[0]
                                assert type(bytes_raw) is bytes
                                assert bytes_raw.startswith(b'PY')
                                assert len(bytes_raw) > 64
                                break
                    logger.info(f'Found data in source: {relative_path}')
                    # FIXME: bytes_raw can be kept from last iteration
                    sequences.append((relative_path, bytes_raw))
                    del bytes_raw
                    handled = True
                except Exception as e:
                    logger.error(f'Assume source, but {e} ({file_path})')
            # TODO: is Nuitka package?
            # TODO: is pyc or single marshalled binary?

            result = detect_process(file_path, relative_path)
            if result is not None:
                sequences.extend(result)

    if not runtimes:
        logger.error('No runtime found')