#! /usr/bin/env python
#
# tokenize.py
#
# Parses a C/C++/C#/D/Java/Pawn/whatever file into an array of
# tuples (string, type)
#

# punctuator lookup table
# each row: [ character, siblings remaining, child row index, punctuator (or None) ]
punc_table = [
    [ '!',  25,  26, '!'    ],  #  0: '!'
    [ '#',  24,  35, '#'    ],  #  1: '#'
    [ '$',  23,   0, '$'    ],  #  2: '$'
    [ '%',  22,  36, '%'    ],  #  3: '%'
    [ '&',  21,  41, '&'    ],  #  4: '&'
    [ '(',  20,   0, '('    ],  #  5: '('
    [ ')',  19,   0, ')'    ],  #  6: ')'
    [ '*',  18,  43, '*'    ],  #  7: '*'
    [ '+',  17,  44, '+'    ],  #  8: '+'
    [ ',',  16,   0, ','    ],  #  9: ','
    [ '-',  15,  46, '-'    ],  # 10: '-'
    [ '.',  14,  50, '.'    ],  # 11: '.'
    [ '/',  13,  53, '/'    ],  # 12: '/'
    [ ':',  12,  54, ':'    ],  # 13: ':'
    [ ';',  11,   0, ';'    ],  # 14: ';'
    [ '<',  10,  56, '<'    ],  # 15: '<'
    [ '=',   9,  63, '='    ],  # 16: '='
    [ '>',   8,  65, '>'    ],  # 17: '>'
    [ '?',   7,   0, '?'    ],  # 18: '?'
    [ '[',   6,  70, '['    ],  # 19: '['
    [ ']',   5,   0, ']'    ],  # 20: ']'
    [ '^',   4,  71, '^'    ],  # 21: '^'
    [ '{',   3,   0, '{'    ],  # 22: '{'
    [ '|',   2,  72, '|'    ],  # 23: '|'
    [ '}',   1,   0, '}'    ],  # 24: '}'
    [ '~',   0,  74, '~'    ],  # 25: '~'
    [ '<',   3,  30, '!<'   ],  # 26: '!<'
    [ '=',   2,  33, '!='   ],  # 27: '!='
    [ '>',   1,  34, '!>'   ],  # 28: '!>'
    [ '~',   0,   0, '!~'   ],  # 29: '!~'
    [ '=',   1,   0, '!<='  ],  # 30: '!<='
    [ '>',   0,  32, '!<>'  ],  # 31: '!<>'
    [ '=',   0,   0, '!<>=' ],  # 32: '!<>='
    [ '=',   0,   0, '!=='  ],  # 33: '!=='
    [ '=',   0,   0, '!>='  ],  # 34: '!>='
    [ '#',   0,   0, '##'   ],  # 35: '##'
    [ ':',   2,  39, '%:'   ],  # 36: '%:'
    [ '=',   1,   0, '%='   ],  # 37: '%='
    [ '>',   0,   0, '%>'   ],  # 38: '%>'
    [ '%',   0,  40, None   ],  # 39: '%:%'
    [ ':',   0,   0, '%:%:' ],  # 40: '%:%:'
    [ '&',   1,   0, '&&'   ],  # 41: '&&'
    [ '=',   0,   0, '&='   ],  # 42: '&='
    [ '=',   0,   0, '*='   ],  # 43: '*='
    [ '+',   1,   0, '++'   ],  # 44: '++'
    [ '=',   0,   0, '+='   ],  # 45: '+='
    [ '-',   2,   0, '--'   ],  # 46: '--'
    [ '=',   1,   0, '-='   ],  # 47: '-='
    [ '>',   0,  49, '->'   ],  # 48: '->'
    [ '*',   0,   0, '->*'  ],  # 49: '->*'
    [ '*',   1,   0, '.*'   ],  # 50: '.*'
    [ '.',   0,  52, '..'   ],  # 51: '..'
    [ '.',   0,   0, '...'  ],  # 52: '...'
    [ '=',   0,   0, '/='   ],  # 53: '/='
    [ ':',   1,   0, '::'   ],  # 54: '::'
    [ '>',   0,   0, ':>'   ],  # 55: ':>'
    [ '%',   4,   0, '<%'   ],  # 56: '<%'
    [ ':',   3,   0, '<:'   ],  # 57: '<:'
    [ '<',   2,  61, '<<'   ],  # 58: '<<'
    [ '=',   1,   0, '<='   ],  # 59: '<='
    [ '>',   0,  62, '<>'   ],  # 60: '<>'
    [ '=',   0,   0, '<<='  ],  # 61: '<<='
    [ '=',   0,   0, '<>='  ],  # 62: '<>='
    [ '=',   0,  64, '=='   ],  # 63: '=='
    [ '=',   0,   0, '==='  ],  # 64: '==='
    [ '=',   1,   0, '>='   ],  # 65: '>='
    [ '>',   0,  67, '>>'   ],  # 66: '>>'
    [ '=',   1,   0, '>>='  ],  # 67: '>>='
    [ '>',   0,  69, '>>>'  ],  # 68: '>>>'
    [ '=',   0,   0, '>>>=' ],  # 69: '>>>='
    [ ']',   0,   0, '[]'   ],  # 70: '[]'
    [ '=',   0,   0, '^='   ],  # 71: '^='
    [ '=',   1,   0, '|='   ],  # 72: '|='
    [ '|',   0,   0, '||'   ],  # 73: '||'
    [ '=',   1,   0, '~='   ],  # 74: '~='
    [ '~',   0,   0, '~~'   ],  # 75: '~~'
]
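
# Illustrative helper (an addition for explanation; not used by the tokenizer
# below): walk punc_table the same way Tokenizer.parse_punctuator does and
# return the longest punctuator at the start of s, or None.  For example,
# matching '>>=' goes row 17 ('>') -> row 66 ('>>') -> row 67 ('>>=').
def punc_lookup_demo(s):
    tab_idx = 0
    idx = 0
    match = None
    while idx < len(s):
        pte = punc_table[tab_idx]
        if pte[0] == s[idx]:
            if pte[3] is not None:
                match = pte[3]      # longest complete punctuator so far
            idx += 1
            tab_idx = pte[2]        # descend to this row's children
            if tab_idx == 0:
                break               # no children; the match is final
        elif pte[1] == 0:
            break                   # no more siblings at this level
        else:
            tab_idx += 1            # try the next sibling row

    return match

# e.g. punc_lookup_demo('>>= x') == '>>=' and punc_lookup_demo('ab') is None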

#
# Token types:
#  0 = newline
#  1 = punctuator
#  2 = integer
#  3 = float
#  4 = string
#  5 = identifier
#
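
# Readable names for the token types above (an addition for demo output;
# the tokenizer itself only uses the numeric codes).
TOKEN_TYPE_NAMES = {
    0: 'newline',
    1: 'punctuator',
    2: 'integer',
    3: 'float',
    4: 'string',
    5: 'identifier',
}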

class Tokenizer:
    def __init__(self):
        self.tokens = []
        self.text = ''
        self.text_idx = 0

    def tokenize_text(self, in_text):
        # The scanners below look ahead by one character, so guarantee a
        # trailing newline to keep every lookahead in bounds.
        if not in_text.endswith('\n'):
            in_text += '\n'

        self.tokens = []
        self.text = in_text
        self.text_idx = 0

        print(in_text)
        try:
            while self.text_idx < len(self.text):
                if self.parse_whitespace():
                    continue
                elif self.text[self.text_idx] == '\\' and self.text[self.text_idx + 1] == '\n':
                    # backslash-newline line continuation: skip both characters
                    self.text_idx += 2
                    continue
                elif self.parse_comment():
                    continue
                elif self.parse_number():
                    continue
                # check strings before identifiers so the L"..." wide-string
                # prefix is not consumed as an identifier named L
                elif self.parse_string():
                    continue
                elif self.parse_identifier():
                    continue
                elif self.parse_punctuator():
                    continue
                else:
                    print("confused: %s" % self.text[self.text_idx:])
                    break
        except:
            print("bombed")
            raise

    def parse_whitespace(self):
        start_idx = self.text_idx
        hit_newline = False
        while self.text_idx < len(self.text):
            if self.text[self.text_idx] in '\n\r':
                hit_newline = True
            elif not self.text[self.text_idx] in ' \t':
                break
            self.text_idx += 1

        # a whitespace run containing newlines collapses to one newline token
        if hit_newline:
            self.tokens.append(('\n', 0))
        return start_idx != self.text_idx

    def parse_comment(self):
        if not self.text[self.text_idx] == '/' or not self.text[self.text_idx + 1] in '/*':
            return False
        if self.text[self.text_idx + 1] == '/':
            # line comment: consume up to the end of the line
            while self.text_idx < len(self.text):
                if self.text[self.text_idx] in '\n\r':
                    break
                self.text_idx += 1
        else:
            # block comment: consume through the closing '*/'
            while self.text_idx < len(self.text) - 1:
                if self.text[self.text_idx] == '*' and self.text[self.text_idx + 1] == '/':
                    self.text_idx += 2
                    break
                self.text_idx += 1
        return True

    def parse_identifier(self):
        if not self.text[self.text_idx].upper() in '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            return False
        start_idx = self.text_idx
        while self.text_idx < len(self.text) and \
              self.text[self.text_idx].upper() in '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890':
            self.text_idx += 1
        self.tokens.append((self.text[start_idx : self.text_idx], 5))
        return True

    def parse_string(self):
        starter = 0
        start_ch = self.text[self.text_idx]
        if start_ch == 'L':
            # wide-string prefix, e.g. L"text"
            starter = 1
            start_ch = self.text[self.text_idx + 1]
        if not start_ch in '"\'':
            return False
        start_idx = self.text_idx
        self.text_idx += starter + 1
        escaped = False
        while self.text_idx < len(self.text):
            if escaped:
                escaped = False
            else:
                if self.text[self.text_idx] == '\\':
                    escaped = True
                elif self.text[self.text_idx] == start_ch:
                    self.text_idx += 1
                    break
            self.text_idx += 1
        self.tokens.append((self.text[start_idx : self.text_idx], 4))
        return True

    # Checks for punctuators
    # Returns whether a punctuator was consumed (True or False)
    def parse_punctuator(self):
        start_idx = self.text_idx
        tab_idx = 0
        saved_punc = None

        while True:
            pte = punc_table[tab_idx]
            if pte[0] == self.text[self.text_idx]:
                if pte[3] is not None:
                    saved_punc = pte[3]
                self.text_idx += 1
                tab_idx = pte[2]
                if tab_idx == 0:
                    break
            elif pte[1] == 0:
                break
            else:
                tab_idx += 1

        if saved_punc is not None:
            # rewind to just past the longest complete match; a partial
            # descent (e.g. '%:%' without the final ':') may have advanced
            # text_idx beyond it
            self.text_idx = start_idx + len(saved_punc)
            self.tokens.append((saved_punc, 1))
            return True
        self.text_idx = start_idx
        return False

    def parse_number(self):
        # A number must start with a digit or a dot followed by a digit
        ch = self.text[self.text_idx]
        if not ch.isdigit() and (ch != '.' or not self.text[self.text_idx + 1].isdigit()):
            return False
        token_type = 2  # integer
        if ch == '.':
            token_type = 3  # float
        did_hex = False
        start_idx = self.text_idx

        # Check for Hex, Octal, or Binary
        # Note that only D and Pawn support binary, but who cares?
        #
        if ch == '0':
            self.text_idx += 1
            ch = self.text[self.text_idx].upper()
            if ch == 'X':                   # hex
                did_hex = True
                self.text_idx += 1
                while self.text[self.text_idx] in '_0123456789abcdefABCDEF':
                    self.text_idx += 1
            elif ch == 'B':                 # binary
                self.text_idx += 1
                while self.text[self.text_idx] in '_01':
                    self.text_idx += 1
            elif ch >= '0' and ch <= '7':   # octal (but allow decimal)
                self.text_idx += 1
                while self.text[self.text_idx] in '_0123456789':
                    self.text_idx += 1
            else:
                # either just 0 or 0.1 or 0UL, etc
                pass
        else:
            # Regular int or float
            while self.text[self.text_idx] in '_0123456789':
                self.text_idx += 1

        # Check if we stopped on a decimal point
        if self.text[self.text_idx] == '.':
            self.text_idx += 1
            token_type = 3  # float
            if did_hex:
                while self.text[self.text_idx] in '_0123456789abcdefABCDEF':
                    self.text_idx += 1
            else:
                while self.text[self.text_idx] in '_0123456789':
                    self.text_idx += 1

        # Check exponent
        # Valid exponents per language (not that it matters):
        # C/C++/D/Java: eEpP
        # C#/Pawn:      eE
        if self.text[self.text_idx] in 'eEpP':
            token_type = 3  # float
            self.text_idx += 1
            if self.text[self.text_idx] in '+-':
                self.text_idx += 1
            while self.text[self.text_idx] in '_0123456789':
                self.text_idx += 1

        # Check the suffixes
        # Valid suffixes per language (not that it matters):
        #        Integer       Float
        # C/C++: uUlL          lLfF
        # C#:    uUlL          fFdDMm
        # D:     uUL           ifFL
        # Java:  lL            fFdD
        # Pawn:  (none)        (none)
        #
        # Note that i, f, d, and m only appear in floats.
        while True:
            if self.text[self.text_idx] in 'iIfFdDmM':
                token_type = 3  # float
            elif not self.text[self.text_idx] in 'lLuU':
                break
            self.text_idx += 1

        self.tokens.append((self.text[start_idx : self.text_idx], token_type))
        return True


text = """
1.23+4-3*16%2
*sin(1.e-3 + .5p32)
"hello" and "hello\\"there"
123 // some comment
a = b + c;
#define abc \\
   5
d = 5 /* hello */ + 3;
"""

t = Tokenizer()
t.tokenize_text(text)
print(t.tokens)
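
# A second, smaller demo (an illustrative addition, not part of the original
# script): tokenize a single statement and print each token with a readable
# type name from the TOKEN_TYPE_NAMES table above.
t2 = Tokenizer()
t2.tokenize_text('x = 0x1F + 2.5e-1;')
for tok, tok_type in t2.tokens:
    print('%-10s %r' % (TOKEN_TYPE_NAMES[tok_type], tok))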