271 lines
8.1 KiB
Python
271 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
|
|
|
# Dump a Python source file as a stream of tokens, ignoring whitespace (other
# than indentation) and comments, so that two source files can be compared
# token by token.
|
|
|
|
import sys
|
|
import re
|
|
|
|
|
|
class PyToken:
    """A single token together with the line number it was created with.

    ``type`` is either one of the integer constants below or, for tokens
    built from a symbol, the symbol text itself (e.g. ``'+='``).
    """

    # Structural tokens
    INDENT = 1
    OUTDENT = 2
    ENDLINE = 3

    # Tokens that carry a value (see the subclasses)
    WORD = 100
    INT = 101
    FLOAT = 102
    STRING = 103

    def __init__(self, type, n_line):
        """Store the token type and the line number it belongs to."""
        self.type = type
        self.n_line = n_line

    def __str__(self):
        """Return a printable form: a marker for structural tokens,
        otherwise the string form of ``type``."""
        if self.type == PyToken.INDENT:
            return '<INDENT>'
        if self.type == PyToken.OUTDENT:
            return '<OUTDENT>'
        if self.type == PyToken.ENDLINE:
            return '<EOL>'
        return str(self.type)

    def __eq__(self, other):
        """Two tokens are equal when their types match; line numbers are
        deliberately not compared.

        Anything that is not a PyToken compares unequal instead of raising
        AttributeError.  A plain ``False`` (rather than ``NotImplemented``)
        is returned so that subclasses can keep testing the result of
        ``super().__eq__(other)`` as a boolean.
        """
        if not isinstance(other, PyToken):
            return False
        return self.type == other.type
|
|
|
|
|
|
class WordToken(PyToken):
    """Token for an identifier or keyword.

    Keywords and other words are not distinguished from each other; both
    are stored as a plain WORD token carrying the word's text.
    """

    def __init__(self, word, n_line):
        """Create a WORD token holding ``word``."""
        super().__init__(PyToken.WORD, n_line)
        self.word = word

    def __str__(self):
        """Return the word itself."""
        return self.word

    def __eq__(self, other):
        """Equal when the base comparison succeeds and the words match."""
        return super().__eq__(other) and self.word == other.word
|
|
|
|
|
|
class IntToken(PyToken):
    """Token for an integer literal, stored as its numeric value."""

    def __init__(self, value, n_line):
        """Parse the literal text ``value`` into an int.

        Underscores are removed before parsing.  The base is first
        auto-detected from the literal's prefix; if that fails and the text
        starts with ``0`` it is parsed as octal (Python 2.x style literal).
        Any other parse failure propagates as ValueError.
        """
        super().__init__(PyToken.INT, n_line)
        digits = value.replace('_', '')
        try:
            self.value = int(digits, 0)
        except ValueError:
            if not value.startswith('0'):
                raise
            # Support Python 2.x octal literals
            self.value = int(digits, 8)

    def __str__(self):
        """Return the value in decimal notation."""
        return str(self.value)

    def __eq__(self, other):
        """Equal when the base comparison succeeds and the values match."""
        return super().__eq__(other) and self.value == other.value
|
|
|
|
|
|
class FloatToken(PyToken):
    """Token for a floating point literal, stored as its numeric value."""

    def __init__(self, value, n_line):
        """Parse the literal text ``value`` (underscores removed) as a float."""
        super().__init__(PyToken.FLOAT, n_line)
        digits = value.replace('_', '')
        self.value = float(digits)

    def __str__(self):
        """Return the string form of the parsed float."""
        return str(self.value)

    def __eq__(self, other):
        """Equal when the base comparison succeeds and the values are
        exactly equal."""
        # TODO: Might need some fuzz
        return super().__eq__(other) and self.value == other.value
|
|
|
|
|
|
class StringToken(PyToken):
    """Token for a string literal, with prefix and content normalized so
    that differently written literals can be compared."""

    # (old, new) substitutions applied to the content, strictly in this order
    _CONTENT_REWRITES = (
        ("\\'", "'"),
        ("'", "\\'"),
        ('\\"', '"'),
        ('\t', '\\t'),
        ('\n', '\\n'),
        ('\r', '\\r'),
    )

    def __init__(self, prefix, content, n_line):
        """Store the normalized ``prefix`` and ``content`` of a literal.

        The prefix is lower-cased and its letters sorted.  In the content,
        single quotes end up backslash-escaped, escaped double quotes are
        unescaped, and literal tab/newline/carriage-return characters are
        replaced by their backslash escapes.
        """
        super().__init__(PyToken.STRING, n_line)

        # Normalize prefix for comparison
        letters = sorted(prefix.lower())
        self.prefix = ''.join(letters)

        # Normalize special characters for comparison
        normalized = content
        for old, new in self._CONTENT_REWRITES:
            normalized = normalized.replace(old, new)
        self.content = normalized

    def __str__(self):
        """Return the literal as prefix + single-quoted content."""
        return "{}'{}'".format(self.prefix, self.content)

    def __eq__(self, other):
        """Equal when the base comparison succeeds and both the normalized
        prefix and the normalized content match."""
        if not super().__eq__(other):
            return False
        same_prefix = self.prefix == other.prefix
        return same_prefix and self.content == other.content
|
|
|
|
|
|
RE_WHITESPACE = re.compile(r'\s+')
RE_WORD = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')
# Regex alternation takes the first alternative that matches, so the
# prefixed forms (hex, binary, octal) must be listed before plain decimal;
# otherwise '0x1F' would match as just '0'.
RE_INT = re.compile(r'0[Xx][0-9A-Fa-f_]+|0[Bb][0-1_]+|0[Oo][0-7_]+|[0-9][0-9_]*')
# Either a number containing a decimal point with an optional exponent, or
# digits followed directly by an exponent (e.g. '1e5').
RE_FLOAT = re.compile(r'(([0-9][0-9_]*)?\.[0-9][0-9_]*|[0-9][0-9_]*\.)([eE][+-]?[0-9][0-9_]*)?'
                      r'|[0-9][0-9_]*[eE][+-]?[0-9][0-9_]*')
# Group 1 is the optional string prefix, group 2 the opening quote(s)
RE_START_STRING = re.compile(r'([rR][fFbB]?|[uU]|[fF][rR]?|[bB][rR]?)?(\'\'\'|\'|"""|")')
|
|
|
|
# Operators and punctuation recognized by the tokenizer.  Matching takes the
# first entry that the input starts with, so whenever two entries share a
# common prefix the longer one has to appear earlier in the tuple; otherwise
# a long token would be mismatched as a sequence of shorter tokens.
SYMBOLIC_TOKENS = (
    # Three-character augmented assignments, then ellipsis and dot
    '<<=', '>>=', '**=', '//=',
    '...', '.',
    # Two-character augmented assignments
    '+=', '-=', '*=', '@=', '/=', '%=', '&=', '|=', '^=',
    # Comparisons, shifts and plain assignment
    '<>', '<<', '<=', '<',
    '>>', '>=', '>',
    '!=', '==', '=',
    # Separators and miscellaneous punctuation
    ',', ';', ':=', ':', '->', '~', '`',
    # Arithmetic and bitwise operators
    '+', '-', '**', '*', '@', '//', '/', '%', '&', '|', '^',
    # Brackets
    '(', ')', '{', '}', '[', ']',
)
|
|
def symbolic_token(line, n_line):
    """Return a PyToken for the symbol that ``line`` starts with.

    The entries of SYMBOLIC_TOKENS are tried in order and the first one that
    is a prefix of ``line`` wins; its text becomes the token's type.
    Returns None when ``line`` does not start with any known symbol.
    """
    symbol = next((s for s in SYMBOLIC_TOKENS if line.startswith(s)), None)
    if symbol is None:
        return None
    return PyToken(symbol, n_line)
|
|
|
|
|
|
def string_token(line, n_line, pysrc):
    """Read a string literal that starts at the beginning of ``line``.

    line   -- remaining text of the current source line
    n_line -- line number of ``line``
    pysrc  -- file object from which further lines are read when the
              literal does not end on the current line

    Returns None if ``line`` does not start with a string literal.
    Otherwise returns a StringToken with two extra attributes for the
    caller: ``rem_line`` (the text following the closing quotes) and
    ``end_line`` (the number of the line on which the literal ended).

    Raises RuntimeError if the end of the input is reached before the
    closing quotes are found.
    """
    match = RE_START_STRING.match(line)
    if not match:
        return None

    prefix = match.group(1) or ''
    quotes = match.group(2)
    # The match is anchored at column 0, so its end is where the content starts
    start = match.end()
    pieces = []

    # Look for the end of the string
    while True:
        end = line.find(quotes, start)
        if end >= 0:
            # A quote is escaped only when it is preceded by an odd number
            # of backslashes; an even run (e.g. the literal '\\') is made
            # of escaped backslashes and leaves the quote as a terminator.
            n_backslashes = 0
            while end - n_backslashes > start and line[end - n_backslashes - 1] == '\\':
                n_backslashes += 1
            if n_backslashes % 2 == 1:
                # Keep the escaped quote character and resume right after it
                pieces.append(line[start:end + 1])
                start = end + 1
                continue
            pieces.append(line[start:end])
            break

        # Read in a new line
        pieces.append(line[start:])
        line = pysrc.readline()
        n_line += 1
        start = 0
        if not line:
            raise RuntimeError('Reached EOF while looking for {}'.format(repr(quotes)))

    token = StringToken(prefix, ''.join(pieces), n_line)
    token.rem_line = line[end + len(quotes):]
    token.end_line = n_line
    return token
|
|
|
|
|
|
def read_tokens(pysrc):
    """Generate the tokens of the Python source read from ``pysrc``.

    pysrc -- file object; lines are pulled from it with readline()

    Yields PyToken objects (or subclasses) in source order.  Blank lines and
    comments produce no tokens.  INDENT/OUTDENT tokens are produced when the
    leading whitespace of a line changes, and an ENDLINE token after each
    line, except while inside an open (), [] or {} pair.

    Raises RuntimeError on inconsistent indentation, on a closing bracket
    that does not match the innermost open one, on an unterminated string,
    and on text that matches no known token.
    """
    # Maps each closing bracket to the opener it must pair with
    openers = {')': '(', '}': '{', ']': '['}

    indent_stack = [0]
    context_stack = []
    n_line = 0

    while True:
        line = pysrc.readline()
        n_line += 1
        if not line:
            break

        if not line.strip() or line.lstrip().startswith('#'):
            continue

        # Look for indentation changes (indentation inside brackets is
        # not significant, so it is only checked at bracket depth zero)
        if not context_stack:
            indent = len(line) - len(line.lstrip())
            if indent > indent_stack[-1]:
                indent_stack.append(indent)
                yield PyToken(PyToken.INDENT, n_line)
            while indent < indent_stack[-1]:
                indent_stack.pop()
                yield PyToken(PyToken.OUTDENT, n_line)
            if indent != indent_stack[-1]:
                raise RuntimeError('Incorrect indentation on line {}'.format(n_line))

        while True:
            line = line.lstrip()
            if not line:
                break
            if line[0] == '#':
                # The rest of this line is a comment
                break

            if line[0] == '\\' and not line[1:].strip():
                # Explicit line joining: carry on with the next physical
                # line as part of the same logical line, without emitting
                # ENDLINE or re-checking indentation.
                line = pysrc.readline()
                n_line += 1
                continue

            # Floats are tried before symbols so that a literal with a
            # leading dot (e.g. '.5') is not split into '.' and an int.
            match = RE_FLOAT.match(line)
            if match:
                yield FloatToken(match.group(), n_line)
                line = line[match.end():]
                continue

            token = symbolic_token(line, n_line)
            if token:
                if token.type in {'(', '{', '['}:
                    context_stack.append(token.type)
                elif token.type in openers:
                    if not context_stack or context_stack[-1] != openers[token.type]:
                        raise RuntimeError('Mismatched token at {} on line {}'.format(line, n_line))
                    context_stack.pop()
                yield token
                line = line[len(token.type):]
                continue

            match = RE_INT.match(line)
            if match:
                yield IntToken(match.group(), n_line)
                line = line[match.end():]
                continue

            token = string_token(line, n_line, pysrc)
            if token:
                # The string may have consumed further lines; continue with
                # whatever follows its closing quotes.
                line = token.rem_line
                n_line = token.end_line
                yield token
                continue

            match = RE_WORD.match(line)
            if match:
                yield WordToken(match.group(), n_line)
                line = line[match.end():]
                continue

            raise RuntimeError('Error: Unrecognized tokens: "{}" at line {}'.format(line, n_line))

        if not context_stack:
            yield PyToken(PyToken.ENDLINE, n_line)
|
|
|
|
|
|
if __name__ == '__main__':
    if '--help' in sys.argv:
        print('Usage: token_dump <file>.py')
        sys.exit(0)

    # Read from the file named on the command line, or from stdin if none
    from_file = len(sys.argv) >= 2
    pysrc = open(sys.argv[1], 'r') if from_file else sys.stdin

    try:
        # Structural tokens end the current output line; every other token
        # is printed on the same line, separated by spaces.
        for tok in read_tokens(pysrc):
            if tok.type in {PyToken.ENDLINE, PyToken.INDENT, PyToken.OUTDENT}:
                print(tok)
            else:
                print(tok, end=' ')
    finally:
        # Close the file even when tokenizing fails, but leave stdin alone
        # since this script did not open it.
        if from_file:
            pysrc.close()
|