-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenizer.py
executable file
·116 lines (94 loc) · 3.97 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
import sys
class Token():
    """A lexical token: the raw text (``value``) plus its category (``type``).

    ``type`` is "EOF" for the sentinel text '$$'; otherwise it is the first
    matching category among "lambda", "ID", "number", "symbol" and "meta",
    or None when the text fits none of them.
    """

    def __init__(self, value):
        # Maps each classifier method to the type name it stands for.
        self.type_dict = {
            self.isLambda: "lambda",
            self.isIdentifier: "ID",
            self.isNumber: "number",
            self.isSymbol: "symbol",
            self.isMeta: "meta",
        }
        if value == '$$':
            self.type = 'EOF'
        else:
            self.type = self.getType(value)
        self.value = value

    def getType(self, token):
        """Return the type name of the first classifier accepting `token`.

        Classifiers are tried in a fixed order, so 'lambda' is reported as
        "lambda" even though it also has the shape of an identifier.
        Returns None when no classifier matches.
        """
        classifiers = (
            self.isLambda,
            self.isIdentifier,
            self.isNumber,
            self.isSymbol,
            self.isMeta,
        )
        for classifier in classifiers:
            if classifier(token):
                return self.type_dict[classifier]
        return None

    # <letter> --> a | b | ... | y | z | A | B | ... | Z | underscore
    def isLetter(self, c):
        """Return True if `c` is a single ASCII letter or an underscore."""
        # The length guard makes '' and multi-character strings plain False.
        return len(c) == 1 and ('a' <= c <= 'z' or 'A' <= c <= 'Z' or c == '_')

    # <digit> --> 0 | 1 | ... | 9
    def isDigit(self, c):
        """Return True if `c` is a single ASCII digit."""
        return len(c) == 1 and '0' <= c <= '9'

    # <number> --> <digit>+
    def isNumber(self, num):
        """Return True if `num` is a non-empty run of ASCII digits."""
        return num != "" and all(self.isDigit(ch) for ch in num)

    # <identifier> --> <letter> (<letter> | <digit>)*
    def isIdentifier(self, token):
        """Return True if `token` is a letter followed by letters/digits."""
        if token == "" or not self.isLetter(token[0]):
            return False
        return all(self.isLetter(ch) or self.isDigit(ch) for ch in token[1:])

    def isLambda(self, t):
        """Return True if `t` is exactly the keyword 'lambda'."""
        return t == 'lambda'

    def isSymbol(self, token):
        """Return True if `token` is one of the single-character symbols."""
        return token in ('(', ')', '+', '-', '*', '/')

    def isMeta(self, token):
        """Return True if `token` is comment text (starts with '//' or '#')."""
        # startswith() is safe on the empty string, where token[0] would raise.
        return token.startswith("//") or token.startswith("#")
class Tokenizer():
    """Hand out the tokens of a source file one at a time via next_token().

    The whole file is read at construction time. Lines are stored in reverse
    order so that pop() yields them in their original order.
    """

    def __init__(self, filename):
        """Read every line of `filename` and position before the first token."""
        # Everything is read up front, so the handle is closed right away
        # instead of staying open for the lifetime of the tokenizer.
        # `self.file` is kept for compatibility; it refers to the closed file.
        with open(filename, "r") as source:
            self.file = source
            self.lines = [line.rstrip('\n') for line in source.readlines()]
        self.lines.reverse()
        self.current_line = ""
        self.current_index = 0  # index just past the last token we read

    def next_token(self):
        """Return the next Token, or Token('$$') once the input is exhausted.

        Spaces and tabs are skipped. A comment ('//' or '#' up to the end of
        the line) is returned as one token. Each symbol character is a token
        of its own. Any other token runs until the next symbol, whitespace,
        '#' or the end of the line.
        """
        # Move to the next non-blank character, loading new lines as needed.
        # This is a loop rather than recursion so long runs of blanks are safe.
        while True:
            while self.current_index >= len(self.current_line):
                if not self.lines:
                    return Token('$$')
                self.current_line = self.lines.pop()
                self.current_index = 0
            if self.current_line[self.current_index] not in (' ', '\t'):
                break
            self.current_index += 1

        line = self.current_line
        start = self.current_index

        # A comment swallows the remainder of the line.
        if line.startswith('//', start) or line[start] == '#':
            self.current_index = len(line)
            return Token(line[start:])

        end = start + 1
        if not self.isSymbol(line[start]):
            # Stopping at the end of the line as well as at a delimiter means
            # a token that finishes the line is still returned.
            while end < len(line) and not self._ends_token(line[end]):
                end += 1
        self.current_index = end
        return Token(line[start:end])

    def _ends_token(self, char):
        """Return True if `char` cannot continue a multi-character token."""
        # '#' starts a comment, so it ends the token before it.
        return self.isSymbol(char) or char in (' ', '\n', '\t', '#')

    def isSymbol(self, token):
        """Return True if `token` is one of the single-character symbols."""
        return token in ('(', ')', '+', '-', '*', '/')