#!/usr/bin/env python3

# Compare two python source files by tokens, ignoring whitespace (other than
# indentation) and comments. Run directly, this dumps the token stream of a
# single file for inspection.

import sys
import re


class PyToken:
    # Structural and literal token types; symbolic operator tokens use the
    # operator string itself (e.g. '==') as their type
    INDENT = 1
    OUTDENT = 2
    ENDLINE = 3
    WORD = 100
    INT = 101
    FLOAT = 102
    STRING = 103

    def __init__(self, type, n_line):
        self.type = type
        self.n_line = n_line

    def __str__(self):
        if self.type == PyToken.INDENT:
            return ''
        if self.type == PyToken.OUTDENT:
            return ''
        if self.type == PyToken.ENDLINE:
            return ''
        return str(self.type)

    def __eq__(self, other):
        return self.type == other.type


class WordToken(PyToken):
    # We don't need to distinguish between keywords and other words, so
    # we just lump them together in a single token type...
    def __init__(self, word, n_line):
        super().__init__(PyToken.WORD, n_line)
        self.word = word

    def __str__(self):
        return self.word

    def __eq__(self, other):
        if not super().__eq__(other):
            return False
        return self.word == other.word


class IntToken(PyToken):
    def __init__(self, value, n_line):
        super().__init__(PyToken.INT, n_line)
        try:
            self.value = int(value.replace('_', ''), 0)
        except ValueError:
            # Support Python 2.x octal literals
            if value.startswith('0'):
                self.value = int(value.replace('_', ''), 8)
            else:
                raise

    def __str__(self):
        return str(self.value)

    def __eq__(self, other):
        if not super().__eq__(other):
            return False
        return self.value == other.value


class FloatToken(PyToken):
    def __init__(self, value, n_line):
        super().__init__(PyToken.FLOAT, n_line)
        self.value = float(value.replace('_', ''))

    def __str__(self):
        return str(self.value)

    def __eq__(self, other):
        if not super().__eq__(other):
            return False
        # TODO: Might need some fuzz
        return self.value == other.value


class StringToken(PyToken):
    def __init__(self, prefix, content, n_line):
        super().__init__(PyToken.STRING, n_line)
        # Normalize prefix for comparison
        self.prefix = ''.join(sorted(prefix.lower()))
        # Normalize special characters for comparison
        self.content = content.replace("\\'", "'").replace("'", "\\'") \
            .replace('\\"', '"').replace('\t', '\\t') \
            .replace('\n', '\\n').replace('\r', '\\r')

    def __str__(self):
        return "{}'{}'".format(self.prefix, self.content)

    def __eq__(self, other):
        if not super().__eq__(other):
            return False
        return self.prefix == other.prefix and self.content == other.content


RE_WHITESPACE = re.compile(r'\s+')
RE_WORD = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')
# The radix-prefixed alternatives must come before the plain decimal one:
# regex alternation is ordered, and putting '[0-9][0-9_]*' first would match
# only the leading '0' of a literal like '0x10'
RE_INT = re.compile(r'0[Xx][0-9A-Fa-f_]+|0[Bb][0-1_]+|0[Oo][0-7_]+|[0-9][0-9_]*')
RE_FLOAT = re.compile(r'(([0-9][0-9_]*)?\.[0-9][0-9_]*|[0-9][0-9_]*\.)([eE][+-]?[0-9][0-9_]*)?')
RE_START_STRING = re.compile(r'([rR][fFbB]?|[uU]|[fF][rR]?|[bB][rR]?)?(\'\'\'|\'|"""|")')

# Note, tokens sharing a common prefix should be entered in order from
# longest to shortest, so we don't mismatch a long token as a sequence
# of shorter tokens
SYMBOLIC_TOKENS = (
    '<<=', '>>=', '**=', '//=', '...', '.',
    '+=', '-=', '*=', '@=', '/=', '%=', '&=', '|=', '^=',
    '<>', '<<', '<=', '<', '>>', '>=', '>', '!=', '==', '=',
    ',', ';', ':=', ':', '->', '~', '`',
    '+', '-', '**', '*', '@', '//', '/', '%', '&', '|', '^',
    '(', ')', '{', '}', '[', ']',
)


def symbolic_token(line, n_line):
    for tok in SYMBOLIC_TOKENS:
        if line.startswith(tok):
            return PyToken(tok, n_line)
    return None


def string_token(line, n_line, pysrc):
    match = RE_START_STRING.match(line)
    if not match:
        return None
    # Look for the end of the string
    prefix = match.group(1)
    if prefix is None:
        prefix = ''
    quotes = match.group(2)
    start = len(prefix) + len(quotes)
    content = ''
    while True:
        end = line.find(quotes, start)
        # Count the backslashes immediately before the closing quotes: an
        # odd count means the quote itself is escaped and doesn't terminate
        # the string, while an even count is just escaped backslashes
        n_backslashes = 0
        while end - 1 - n_backslashes >= 0 and line[end - 1 - n_backslashes] == '\\':
            n_backslashes += 1
        if end > 0 and n_backslashes % 2 == 1:
            content += line[start:end + 1]
            start = end + 1
            continue
        elif end >= 0:
            content += line[start:end]
            break
        # Read in a new line
        content += line[start:]
        line = pysrc.readline()
        n_line += 1
        start = 0
        if not line:
            raise RuntimeError('Reached EOF while looking for {}'.format(repr(quotes)))
    token = StringToken(prefix, content, n_line)
    token.rem_line = line[end + len(quotes):]
    token.end_line = n_line
    return token
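

# Example (the snippet here is illustrative, not from the original source):
# given the line
#     x = 0x10 + 1.5  # comment
# read_tokens() below yields WORD('x'), '=', INT(16), '+', FLOAT(1.5) and
# then ENDLINE. Comments and spacing are discarded, and numeric literals
# compare by value, so '0x10' in one file matches '16' in the other.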


def read_tokens(pysrc):
    indent_stack = [0]
    context_stack = []
    n_line = 0
    while True:
        line = pysrc.readline()
        n_line += 1
        if not line:
            break
        if not line.strip() or line.lstrip().startswith('#'):
            continue
        # Look for indentation changes, except inside brackets, where the
        # logical line continues and indentation is not significant
        if len(context_stack) == 0:
            indent = len(line) - len(line.lstrip())
            if indent > indent_stack[-1]:
                indent_stack.append(indent)
                yield PyToken(PyToken.INDENT, n_line)
            while indent < indent_stack[-1]:
                indent_stack.pop()
                yield PyToken(PyToken.OUTDENT, n_line)
            if indent != indent_stack[-1]:
                raise RuntimeError('Incorrect indentation on line {}'.format(n_line))
        while True:
            line = line.lstrip()
            if not line:
                break
            if line[0] == '#':
                # The rest of this line is a comment
                break
            if line[0] == '\\' and not line[1:].strip():
                # Explicit backslash continuation: keep tokenizing the next
                # physical line as part of the same logical line
                line = pysrc.readline()
                n_line += 1
                if not line:
                    raise RuntimeError('Reached EOF after line continuation')
                continue
            # Try floats before symbolic tokens so the leading '.' of a
            # literal like '.5' isn't consumed as the '.' operator
            match = RE_FLOAT.match(line)
            if match:
                yield FloatToken(match.group(), n_line)
                line = line[match.end():]
                continue
            token = symbolic_token(line, n_line)
            if token:
                if token.type in {'(', '{', '['}:
                    context_stack.append(token.type)
                elif token.type == ')':
                    if len(context_stack) == 0 or context_stack[-1] != '(':
                        raise RuntimeError('Mismatched token at {} on line {}'.format(line, n_line))
                    context_stack.pop()
                elif token.type == '}':
                    if len(context_stack) == 0 or context_stack[-1] != '{':
                        raise RuntimeError('Mismatched token at {} on line {}'.format(line, n_line))
                    context_stack.pop()
                elif token.type == ']':
                    if len(context_stack) == 0 or context_stack[-1] != '[':
                        raise RuntimeError('Mismatched token at {} on line {}'.format(line, n_line))
                    context_stack.pop()
                yield token
                line = line[len(token.type):]
                continue
            match = RE_INT.match(line)
            if match:
                yield IntToken(match.group(), n_line)
                line = line[match.end():]
                continue
            token = string_token(line, n_line, pysrc)
            if token:
                # string_token may have consumed extra physical lines
                line = token.rem_line
                n_line = token.end_line
                yield token
                continue
            match = RE_WORD.match(line)
            if match:
                yield WordToken(match.group(), n_line)
                line = line[match.end():]
                continue
            raise RuntimeError('Error: Unrecognized tokens: "{}" at line {}'.format(line, n_line))
        # Open brackets mean the logical line continues, so hold the ENDLINE
        if len(context_stack) == 0:
            yield PyToken(PyToken.ENDLINE, n_line)


if __name__ == '__main__':
    if '--help' in sys.argv:
        print('Usage: token_dump <file.py>')
        sys.exit(0)
    if len(sys.argv) >= 2:
        pysrc = open(sys.argv[1], 'r')
    else:
        pysrc = sys.stdin
    for tok in read_tokens(pysrc):
        if tok.type in {PyToken.ENDLINE, PyToken.INDENT, PyToken.OUTDENT}:
            print(tok)
        else:
            print(tok, end=' ')
    pysrc.close()
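

# A minimal sketch of the file comparison this tokenizer was written for;
# it is not part of the dump utility above, and the name tokens_equal is
# illustrative. Two files match when their token streams are equal pairwise,
# using the __eq__ methods defined on the token classes.
def tokens_equal(path_a, path_b):
    import itertools
    with open(path_a) as a, open(path_b) as b:
        for tok_a, tok_b in itertools.zip_longest(read_tokens(a), read_tokens(b)):
            # zip_longest pads the shorter stream with None, so leftover
            # tokens on either side also count as a mismatch
            if tok_a is None or tok_b is None or tok_a != tok_b:
                return False
    return True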