feat: file format detection, single file mode

2025-03-25 23:47:21 +08:00
parent 48cab893c9
commit e7397945d3
3 changed files with 195 additions and 37 deletions
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ This project aims to convert armored data back to bytecode assembly and (experim
 > [!WARNING]
 >
-> **Disassembly results are accurate, but decompiled code can be incomplete and incorrect.** Bytecode has changed a lot in recent Python versions, while pycdc has limited support for bytecode in newer versions.
+> **Disassembly results are accurate, but decompiled code can be incomplete and incorrect.** [See issue #3](https://github.com/Lil-House/Pyarmor-Static-Unpack-1shot/issues/3)
 ## Features
@@ -56,7 +56,7 @@ All files generated from this tool have a `.1shot.` in file names. If you want t
 Note:
- Subdirectories called `__pycache__` or `site-packages` will not be touched, and symbolic links will not be followed, to avoid repeat or forever loop and save time. If you really need them, run the script later in these directories and specify the runtime.
+- A subdirectory will be skipped if its name is exactly `__pycache__` or `site-packages`, or if it directly contains a file named `.no1shot`. Symbolic links will not be followed. This avoids repeated work and infinite loops, and saves time. If you really need them, run the script later in these directories and specify the runtime.
 - Archives, executables generated by PyInstaller and so on, must be unpacked by other tools before decrypting, or you will encounter undefined behavior.
 ## Feedback
@@ -66,7 +66,6 @@ Feel free to open an issue if you have any questions, suggestions, or problems.
 ## Todo (PR Welcome!)
 - [ ] Multi-platform pyarmor_runtime executable
 - [ ] Accept more input forms
 - [ ] Support more obfuscating options
 - [ ] Use asyncio for concurrency
 - [ ] Pyarmor 7 and before (Later or never.)
--- a/helpers/detect.py
+++ b/helpers/detect.py
@@ -0,0 +1,157 @@
 import logging
 import os
 from typing import List, Tuple
 def ascii_ratio(data: bytes) -> float:
    '''
    Returns the fraction of bytes in `data` that are printable ASCII
    (32 <= byte < 127), as a float between 0.0 and 1.0.
    Empty input yields 0.0 instead of dividing by zero.
    '''
    if not data:
        return 0.0
    return sum(32 <= c < 127 for c in data) / len(data)
 def source_as_file(file_path: str) -> List[bytes] | None:
    try:
        with open(file_path, 'r') as f:
            co = compile(f.read(), '<str>', 'exec')
            data = [i for i in co.co_consts if type(i) is bytes
                    and i.startswith(b'PY00') and len(i) > 64]
            return data
    except:
        return None
 def source_as_lines(file_path: str) -> List[bytes] | None:
    data = []
    try:
        with open(file_path, 'r') as f:
            for line in f:
                try:
                    co = compile(line, '<str>', 'exec')
                    data.extend([i for i in co.co_consts if type(i) is bytes
                                 and i.startswith(b'PY00') and len(i) > 64])
                except:
                    # ignore not compilable lines
                    pass
    except:
        return None
    return data
 def find_data_from_bytes(data: bytes, max_count=-1) -> List[bytes]:
    '''
    Scans `data` for records that start with b'PY00' and returns them.

    Layout as read here (little-endian 32-bit fields, offsets relative to
    the marker): header length at 28, body length at 32, and at 56 a flag
    that is non-zero when the record is directly followed by another one.
    Following the original comment, such followers are taken to be the same
    payload for other Python versions and are skipped, not returned.

    A candidate is rejected when its header is longer than 256 bytes, its
    body is longer than 0xFFFFF bytes, the record would run past the end of
    `data`, or the record is shorter than the 64 bytes needed to hold the
    fields above. The last rule also guarantees the scan always advances.

    `max_count` limits the number of returned records; a negative value
    means no limit.
    '''
    marker = b'PY00'
    min_len = 64
    size = len(data)
    result = []
    pos = 0
    # Work with offsets instead of re-slicing `data`, which copied the
    # remaining buffer on every candidate.
    while len(result) != max_count:
        pos = data.find(marker, pos)
        if pos == -1 or size - pos < min_len:
            break
        header_len = int.from_bytes(data[pos + 28:pos + 32], 'little')
        body_len = int.from_bytes(data[pos + 32:pos + 36], 'little')
        total_len = header_len + body_len
        if header_len > 256 or body_len > 0xFFFFF \
                or total_len < min_len or total_len > size - pos:
            # compressed or coincident, skip exactly the marker so that a
            # real marker right behind it is still seen
            pos += len(marker)
            continue
        result.append(data[pos:pos + total_len])
        # maybe followed by data for other Python versions from the same file,
        # we do not extract them
        followed_by_another_equivalent = int.from_bytes(
            data[pos + 56:pos + 60], 'little') != 0
        pos += total_len
        while followed_by_another_equivalent \
                and size - pos >= min_len \
                and data.startswith(marker, pos):
            header_len = int.from_bytes(data[pos + 28:pos + 32], 'little')
            body_len = int.from_bytes(data[pos + 32:pos + 36], 'little')
            total_len = header_len + body_len
            if total_len < min_len:
                # bogus lengths: stop skipping and let the outer scan
                # re-examine this position instead of spinning in place
                break
            followed_by_another_equivalent = int.from_bytes(
                data[pos + 56:pos + 60], 'little') != 0
            pos += total_len
    return result
 def nuitka_package(head: bytes, relative_path: str) -> None | List[Tuple[str, bytes]]:
    first_occurrence = head.find(b'PY00')
    if first_occurrence == -1:
        return None
    last_dot_bytecode = head.rfind(b'.bytecode\x00', 0, first_occurrence)
    if last_dot_bytecode == -1:
        return None
    length = int.from_bytes(
        head[last_dot_bytecode-4:last_dot_bytecode], 'little')
    end = last_dot_bytecode + length
    cur = last_dot_bytecode
    result = []
    while cur < end:
        module_name_len = head.find(b'\x00', cur, end) - cur
        module_name = head[cur:cur + module_name_len].decode('utf-8')
        cur += module_name_len + 1
        module_len = int.from_bytes(head[cur:cur + 4], 'little')
        cur += 4
        module_data = find_data_from_bytes(head[cur:cur + module_len], 1)
        if module_data:
            result.append((os.path.join(relative_path.rstrip(
                '/\\') + '.1shot.ext', module_name), module_data[0]))
        cur += module_len
    if result:
        logger = logging.getLogger('detect')
        logger.info(f'Found data in Nuitka package: {relative_path}')
        return result
    return None
 def detect_process(file_path: str, relative_path: str) -> None | List[Tuple[str, bytes]]:
    '''
    Returns a list of (relative_path, bytes_raw) tuples, or None.
    Do not raise exceptions.
    '''
    logger = logging.getLogger('detect')
    try:
        with open(file_path, 'rb') as f:
            head = f.read(16 * 1024 * 1024)
    except:
        logger.error(f'Failed to read file: {relative_path}')
        return None
    if b'__pyarmor__' not in head:
        # no need to dig deeper
        return None
    if ascii_ratio(head[:2048]) >= 0.9:
        # the whole file may not be compiled, but we can still try some lines;
        # None means failure (then we make another try),
        # empty list means success but no data found (then we skip this file)
        result = source_as_file(file_path)
        if result is None:
            result = source_as_lines(file_path)
        if result is None:
            return None
        match len(result):
            case 0:
                return None
            case 1:
                logger.info(f'Found data in source: {relative_path}')
                return [(relative_path, result[0])]
            case _:
                logger.info(f'Found data in source: {relative_path}')
                return [(f'{relative_path}__{i}', result[i]) for i in range(len(result))]
    # binary file
    # ignore data after 16MB, before we have a reason to read more
    if b'Error, corrupted constants object' in head:
        # an interesting special case: packer put armored data in a Nuitka package
        # we can know the exact module names, instead of adding boring __0, __1, ...
        return nuitka_package(head, relative_path)
    result = find_data_from_bytes(head)
    match len(result):
        case 0:
            return None
        case 1:
            logger.info(f'Found data in binary: {relative_path}')
            return [(relative_path, result[0])]
        case _:
            logger.info(f'Found data in binary: {relative_path}')
            return [(f'{relative_path}__{i}', result[i]) for i in range(len(result))]
--- a/helpers/shot.py
+++ b/helpers/shot.py
@@ -5,6 +5,7 @@ import os
 import subprocess
 from typing import Dict, List, Tuple
 from detect import detect_process
 from runtime import RuntimeInfo
@@ -157,6 +158,7 @@ def main():
    if args.runtime:
        specified_runtime = RuntimeInfo(args.runtime)
        print(specified_runtime)
        runtimes = {specified_runtime.serial_number: specified_runtime}
    else:
        specified_runtime = None
@@ -167,23 +169,50 @@ def main():
    if args.output_dir and not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if os.path.isfile(args.directory):
        if specified_runtime is None:
            logger.error('Please specify `pyarmor_runtime` file by `-r` if input is a file')
            return
        logger.info('Single file mode')
        result = detect_process(args.directory, args.directory)
        if result is None:
            logger.error('No armored data found')
            return
        sequences.extend(result)
        decrypt_process(runtimes, sequences, args)
        return  # single file mode ends here
    dir_path: str
    dirs: List[str]
    files: List[str]
    for dir_path, dirs, files in os.walk(args.directory, followlinks=False):
        if '.no1shot' in files:
            logger.info(f'Skipping {dir_path} because of `.no1shot`')
            dirs.clear()
            files.clear()
            continue
        for d in ['__pycache__', 'site-packages']:
            if d in dirs:
                dirs.remove(d)
        for file_name in files:
            if '.1shot.' in file_name:
                continue
-            handled = False
+
            file_path = os.path.join(dir_path, file_name)
            relative_path = os.path.relpath(file_path, args.directory)
            if file_name.endswith('.pyz'):
                with open(file_path, 'rb') as f:
                    head = f.read(16 * 1024 * 1024)
                if b'PY00' in head \
                        and (not os.path.exists(file_path + '_extracted')
                             or len(os.listdir(file_path + '_extracted')) == 0):
                    logger.error(
                        f'A PYZ file containing armored data is detected, but the PYZ file has not been extracted by other tools. This error is not a problem with this tool. If the folder is extracted by Pyinstxtractor, please read the output information of Pyinstxtractor carefully. ({relative_path})')
                continue
            # is pyarmor_runtime?
-            if not handled \
+            if specified_runtime is None \
                    and specified_runtime is None \
                    and file_name.startswith('pyarmor_runtime') \
                    and file_name.endswith(('.pyd', '.so', '.dylib')):
                try:
@@ -192,40 +221,13 @@ def main():
                    logger.info(
                        f'Found new runtime: {new_runtime.serial_number} ({file_path})')
                    print(new_runtime)
-                    handled = True
+                    continue
                except:
                    pass
-            try:
+            result = detect_process(file_path, relative_path)
-                with open(file_path, 'rb') as f:
+            if result is not None:
-                    beacon = f.read(16 * 1024 * 1024)
+                sequences.extend(result)
            except:
                logger.error(f'Failed to read file: {relative_path}')
                continue
            # is UTF-8 source?
            # TODO: only support natural one line now
            if not handled and b'__pyarmor__(__name__, __file__,' in beacon:
                try:
                    with open(file_path, 'r') as f:
                        for line in f:
                            if line.startswith('__pyarmor__(') and line.rstrip().endswith(')'):
                                co = compile(line, '<str>', 'exec')
                                bytes_raw = co.co_consts[0]
                                assert type(bytes_raw) is bytes
                                assert bytes_raw.startswith(b'PY')
                                assert len(bytes_raw) > 64
                                break
                    logger.info(f'Found data in source: {relative_path}')
                    # FIXME: bytes_raw can be kept from last iteration
                    sequences.append((relative_path, bytes_raw))
                    del bytes_raw
                    handled = True
                except Exception as e:
                    logger.error(f'Assume source, but {e} ({file_path})')
            # TODO: is Nuitka package?
            # TODO: is pyc or single marshalled binary?
    if not runtimes:
        logger.error('No runtime found')