feat: file format detection, single file mode

2025-03-25 23:47:21 +08:00
parent 48cab893c9
commit e7397945d3
3 changed files with 195 additions and 37 deletions


@@ -6,7 +6,7 @@ This project aims to convert armored data back to bytecode assembly and (experim
> [!WARNING]
>
> **Disassembly results are accurate, but decompiled code can be incomplete and incorrect.** Bytecode has changed a lot in recent Python versions, while pycdc has limited support for bytecode in newer versions.
> **Disassembly results are accurate, but decompiled code can be incomplete and incorrect.** [See issue #3](https://github.com/Lil-House/Pyarmor-Static-Unpack-1shot/issues/3)
## Features
@@ -56,7 +56,7 @@ All files generated from this tool have a `.1shot.` in file names. If you want t
Note:
- Subdirectories called `__pycache__` or `site-packages` will not be touched, and symbolic links will not be followed, to avoid repeat or forever loop and save time. If you really need them, run the script later in these directories and specify the runtime.
- Subdirectories will not be touched if the folder name is exactly `__pycache__` or `site-packages` or it directly contains a file named `.no1shot`, and symbolic links will not be followed, to avoid repeat or forever loop and save time. If you really need them, run the script later in these directories and specify the runtime.
- Archives, executables generated by PyInstaller and so on, must be unpacked by other tools before decrypting, or you will encounter undefined behavior.
## Feedback
@@ -66,7 +66,6 @@ Feel free to open an issue if you have any questions, suggestions, or problems.
## Todo (PR Welcome!)
- [ ] Multi-platform pyarmor_runtime executable
- [ ] Accept more input forms
- [ ] Support more obfuscating options
- [ ] Use asyncio for concurrency
- [ ] Pyarmor 7 and before (Later or never.)
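The `.no1shot` opt-out described in the README note above can be exercised as follows. This is a minimal sketch with hypothetical paths: an empty marker file is enough, and the walk below mirrors the directory-pruning check this commit adds to the main script.

```python
import os
import pathlib

# Hypothetical layout: opt `project/vendored/` out of scanning.
pathlib.Path('project/vendored').mkdir(parents=True, exist_ok=True)
pathlib.Path('project/vendored/.no1shot').touch()

for dir_path, dirs, files in os.walk('project', followlinks=False):
    if '.no1shot' in files:
        dirs.clear()   # do not descend into this subtree
        files.clear()  # and do not process its files
        continue
    # ... per-file detection would run here ...
```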

helpers/detect.py (new file, 157 lines)

@@ -0,0 +1,157 @@
import logging
import os
from typing import List, Tuple


def ascii_ratio(data: bytes) -> float:
    return sum(32 <= c < 127 for c in data) / len(data)


def source_as_file(file_path: str) -> List[bytes] | None:
    try:
        with open(file_path, 'r') as f:
            co = compile(f.read(), '<str>', 'exec')
        data = [i for i in co.co_consts if type(i) is bytes
                and i.startswith(b'PY00') and len(i) > 64]
        return data
    except:
        return None


def source_as_lines(file_path: str) -> List[bytes] | None:
    data = []
    try:
        with open(file_path, 'r') as f:
            for line in f:
                try:
                    co = compile(line, '<str>', 'exec')
                    data.extend([i for i in co.co_consts if type(i) is bytes
                                 and i.startswith(b'PY00') and len(i) > 64])
                except:
                    # ignore not compilable lines
                    pass
    except:
        return None
    return data


def find_data_from_bytes(data: bytes, max_count=-1) -> List[bytes]:
    result = []
    idx = 0
    while len(result) != max_count:
        idx = data.find(b'PY00')
        if idx == -1:
            break
        data = data[idx:]
        if len(data) < 64:
            break
        header_len = int.from_bytes(data[28:32], 'little')
        body_len = int.from_bytes(data[32:36], 'little')
        if header_len > 256 or body_len > 0xFFFFF or header_len + body_len > len(data):
            # compressed or coincident, skip
            data = data[5:]
            continue
        result.append(data[:header_len + body_len])
        # maybe followed by data for other Python versions from the same file,
        # we do not extract them
        followed_by_another_equivalent = int.from_bytes(
            data[56:60], 'little') != 0
        data = data[header_len + body_len:]
        while followed_by_another_equivalent \
                and data.startswith(b'PY00') \
                and len(data) >= 64:
            header_len = int.from_bytes(data[28:32], 'little')
            body_len = int.from_bytes(data[32:36], 'little')
            followed_by_another_equivalent = int.from_bytes(
                data[56:60], 'little') != 0
            data = data[header_len + body_len:]
    return result


def nuitka_package(head: bytes, relative_path: str) -> None | List[Tuple[str, bytes]]:
    first_occurrence = head.find(b'PY00')
    if first_occurrence == -1:
        return None
    last_dot_bytecode = head.rfind(b'.bytecode\x00', 0, first_occurrence)
    if last_dot_bytecode == -1:
        return None
    length = int.from_bytes(
        head[last_dot_bytecode-4:last_dot_bytecode], 'little')
    end = last_dot_bytecode + length
    cur = last_dot_bytecode
    result = []
    while cur < end:
        module_name_len = head.find(b'\x00', cur, end) - cur
        module_name = head[cur:cur + module_name_len].decode('utf-8')
        cur += module_name_len + 1
        module_len = int.from_bytes(head[cur:cur + 4], 'little')
        cur += 4
        module_data = find_data_from_bytes(head[cur:cur + module_len], 1)
        if module_data:
            result.append((os.path.join(relative_path.rstrip(
                '/\\') + '.1shot.ext', module_name), module_data[0]))
        cur += module_len
    if result:
        logger = logging.getLogger('detect')
        logger.info(f'Found data in Nuitka package: {relative_path}')
        return result
    return None


def detect_process(file_path: str, relative_path: str) -> None | List[Tuple[str, bytes]]:
    '''
    Returns a list of (relative_path, bytes_raw) tuples, or None.
    Do not raise exceptions.
    '''
    logger = logging.getLogger('detect')

    try:
        with open(file_path, 'rb') as f:
            head = f.read(16 * 1024 * 1024)
    except:
        logger.error(f'Failed to read file: {relative_path}')
        return None

    if b'__pyarmor__' not in head:
        # no need to dig deeper
        return None

    if ascii_ratio(head[:2048]) >= 0.9:
        # the whole file may not be compiled, but we can still try some lines;
        # None means failure (then we make another try),
        # empty list means success but no data found (then we skip this file)
        result = source_as_file(file_path)
        if result is None:
            result = source_as_lines(file_path)
        if result is None:
            return None
        match len(result):
            case 0:
                return None
            case 1:
                logger.info(f'Found data in source: {relative_path}')
                return [(relative_path, result[0])]
            case _:
                logger.info(f'Found data in source: {relative_path}')
                return [(f'{relative_path}__{i}', result[i]) for i in range(len(result))]

    # binary file
    # ignore data after 16MB, before we have a reason to read more
    if b'Error, corrupted constants object' in head:
        # an interesting special case: packer put armored data in a Nuitka package
        # we can know the exact module names, instead of adding boring __0, __1, ...
        return nuitka_package(head, relative_path)

    result = find_data_from_bytes(head)
    match len(result):
        case 0:
            return None
        case 1:
            logger.info(f'Found data in binary: {relative_path}')
            return [(relative_path, result[0])]
        case _:
            logger.info(f'Found data in binary: {relative_path}')
            return [(f'{relative_path}__{i}', result[i]) for i in range(len(result))]
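For reference, the blob layout that `find_data_from_bytes` expects can be exercised with a synthetic self-check. The offsets are simply the ones the parser above reads (magic at 0, header length at 28, body length at 32, the "followed by another equivalent" flag at 56); the blob is fabricated, not real Pyarmor output, and `helpers/` is assumed to be on `sys.path`.

```python
from detect import find_data_from_bytes

header_len, body_len = 64, 16
blob = (
    b'PY00'                             # magic at offset 0
    + b'\x00' * 24                      # padding up to offset 28
    + header_len.to_bytes(4, 'little')  # offsets 28..32: header length
    + body_len.to_bytes(4, 'little')    # offsets 32..36: body length
    + b'\x00' * 28                      # rest of the 64-byte header; flag at 56..60 stays 0
    + b'\xaa' * body_len                # fake encrypted body
)

# The parser should extract exactly this blob from the surrounding junk.
found = find_data_from_bytes(b'junk before ' + blob + b' junk after')
assert found == [blob]
```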

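A usage sketch for the entry point, `detect_process`, following the contract in its docstring (paths are hypothetical; `helpers/` on `sys.path` is assumed):

```python
from detect import detect_process

found = detect_process('/samples/app/obf_module.py', 'app/obf_module.py')
if found is None:
    print('no armored data (or the file could not be read)')
else:
    for relative_path, bytes_raw in found:
        # One entry per PY00 blob; when a file yields several blobs,
        # the relative paths get __0, __1, ... suffixes.
        print(relative_path, len(bytes_raw))
```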

@@ -5,6 +5,7 @@ import os
import subprocess
from typing import Dict, List, Tuple
from detect import detect_process
from runtime import RuntimeInfo
@@ -157,6 +158,7 @@ def main():
    if args.runtime:
        specified_runtime = RuntimeInfo(args.runtime)
        print(specified_runtime)
        runtimes = {specified_runtime.serial_number: specified_runtime}
    else:
        specified_runtime = None
@@ -167,23 +169,50 @@ def main():
    if args.output_dir and not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if os.path.isfile(args.directory):
        if specified_runtime is None:
            logger.error('Please specify `pyarmor_runtime` file by `-r` if input is a file')
            return
        logger.info('Single file mode')
        result = detect_process(args.directory, args.directory)
        if result is None:
            logger.error('No armored data found')
            return
        sequences.extend(result)
        decrypt_process(runtimes, sequences, args)
        return  # single file mode ends here

    dir_path: str
    dirs: List[str]
    files: List[str]
    for dir_path, dirs, files in os.walk(args.directory, followlinks=False):
        if '.no1shot' in files:
            logger.info(f'Skipping {dir_path} because of `.no1shot`')
            dirs.clear()
            files.clear()
            continue
        for d in ['__pycache__', 'site-packages']:
            if d in dirs:
                dirs.remove(d)
        for file_name in files:
            if '.1shot.' in file_name:
                continue
            handled = False
            file_path = os.path.join(dir_path, file_name)
            relative_path = os.path.relpath(file_path, args.directory)

            if file_name.endswith('.pyz'):
                with open(file_path, 'rb') as f:
                    head = f.read(16 * 1024 * 1024)
                if b'PY00' in head \
                        and (not os.path.exists(file_path + '_extracted')
                             or len(os.listdir(file_path + '_extracted')) == 0):
                    logger.error(
                        f'A PYZ file containing armored data is detected, but the PYZ file has not been extracted by other tools. This error is not a problem with this tool. If the folder is extracted by Pyinstxtractor, please read the output information of Pyinstxtractor carefully. ({relative_path})')
                    continue

            # is pyarmor_runtime?
            if not handled \
                    and specified_runtime is None \
            if specified_runtime is None \
                    and file_name.startswith('pyarmor_runtime') \
                    and file_name.endswith(('.pyd', '.so', '.dylib')):
                try:
@@ -192,40 +221,13 @@ def main():
                    logger.info(
                        f'Found new runtime: {new_runtime.serial_number} ({file_path})')
                    print(new_runtime)
                    handled = True
                    continue
                except:
                    pass

            try:
                with open(file_path, 'rb') as f:
                    beacon = f.read(16 * 1024 * 1024)
            except:
                logger.error(f'Failed to read file: {relative_path}')
                continue

            # is UTF-8 source?
            # TODO: only support natural one line now
            if not handled and b'__pyarmor__(__name__, __file__,' in beacon:
                try:
                    with open(file_path, 'r') as f:
                        for line in f:
                            if line.startswith('__pyarmor__(') and line.rstrip().endswith(')'):
                                co = compile(line, '<str>', 'exec')
                                bytes_raw = co.co_consts[0]
                                assert type(bytes_raw) is bytes
                                assert bytes_raw.startswith(b'PY')
                                assert len(bytes_raw) > 64
                                break
                    logger.info(f'Found data in source: {relative_path}')
                    # FIXME: bytes_raw can be kept from last iteration
                    sequences.append((relative_path, bytes_raw))
                    del bytes_raw
                    handled = True
                except Exception as e:
                    logger.error(f'Assume source, but {e} ({file_path})')
            # TODO: is Nuitka package?
            # TODO: is pyc or single marshalled binary?

            result = detect_process(file_path, relative_path)
            if result is not None:
                sequences.extend(result)

    if not runtimes:
        logger.error('No runtime found')