tokenizer3.py
101 lines (75 loc) · 2.41 KB
from typing import List
from enum import Enum, unique
import re

WHITESPACE_CHARS = r'\s'


# Priority-ordered token list: the tokenizer builds a regex alternation in
# declaration order, so longer literals must precede their prefixes
# (e.g. '[[[' before '[[' before '[').
@unique
class TokenType(Enum):
    # Sentinel types: integer values, never matched as source literals
    Null = -1
    String = 0
    QuotedString = 1
    Whitespace = 2
    # Literal tokens, longest-first within each family
    OpenComment = '[!--'
    CloseComment = '--]'
    OpenTripleBracket = '[[['
    CloseTripleBracket = ']]]'
    OpenDoubleBracket = '[['
    CloseDoubleBracket = ']]'
    OpenSingleBracket = '['
    CloseSingleBracket = ']'
    Quote = '"'
    Blockquote = '>'
    DoubleAt = '@@'
    DoubleHash = '##'
    DoublePipe = '||'
    DoubleSup = '^^'
    DoubleSub = ',,'
    OpenHTMLLiteral = '@<'
    CloseHTMLLiteral = '>@'
    OpenInlineCode = '{{'
    CloseInlineCode = '}}'
    HrBeginning = '----'
    ClearFloatBeginning = '~~~~'
    DoubleDash = '--'
    DoubleAsterisk = '**'
    DoubleSlash = '//'
    DoubleUnderline = '__'
    Equals = '='
    Pipe = '|'
    Asterisk = '*'
    Hash = '#'
    Plus = '+'
    Newline = '\n'
    Slash = '/'
    Backslash = '\\'
    Tilde = '~'
    Underline = '_'


class Token:
    def __init__(self, type: TokenType, start: int, end: int, source: str):
        self.type = type
        self.start = start
        self.end = end
        self.source = source

    @property
    def raw(self):
        return self.source[self.start:self.end]

    def __repr__(self):
        return '<Token type=%s, raw=%s>' % (self.type.name, repr(self.raw))


class Tokenizer:
    def __init__(self):
        # Integer-valued members are sentinels with no source literal, so
        # only string-valued members become matching rules.
        self.rules = {rule.value: rule for rule in TokenType if not isinstance(rule.value, int)}
        # One alternation over every literal (in priority order), plus a
        # final branch that matches runs of whitespace.
        self.token_regex = re.compile(
            '|'.join(re.escape(rule.value) for rule in TokenType if not isinstance(rule.value, int))
            + f'|[{WHITESPACE_CHARS}]+'
        )

    def tokenize(self, source: str) -> List[Token]:
        tokens: List[Token] = []
        last_end = 0
        for match in self.token_regex.finditer(source):
            start, end = match.span()
            # Unmatched text between two matches becomes a String token
            if start > last_end:
                tokens.append(Token(TokenType.String, last_end, start, source))
            group = match.group()
            # Whitespace runs are not in the rules map, so they fall back
            # to the Whitespace sentinel
            token_type = self.rules.get(group, TokenType.Whitespace)
            tokens.append(Token(token_type, start, end, source))
            last_end = end
        # Trailing unmatched text, if any
        if last_end < len(source):
            tokens.append(Token(TokenType.String, last_end, len(source), source))
        return tokens
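
A minimal usage sketch (illustrative; the sample input and the expected output shown in the comments are assumptions based on the token definitions above, not part of the original file):

if __name__ == '__main__':
    tokenizer = Tokenizer()
    # '**' and '[[' are declared before '*' and '[', so the longer literals
    # win; unmatched runs such as 'bold' come back as String tokens.
    for token in tokenizer.tokenize('**bold** [[module]] text'):
        print(token)
    # Expected output:
    # <Token type=DoubleAsterisk, raw='**'>
    # <Token type=String, raw='bold'>
    # <Token type=DoubleAsterisk, raw='**'>
    # <Token type=Whitespace, raw=' '>
    # <Token type=OpenDoubleBracket, raw='[['>
    # <Token type=String, raw='module'>
    # <Token type=CloseDoubleBracket, raw=']]'>
    # <Token type=Whitespace, raw=' '>
    # <Token type=String, raw='text'>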