# Token tables and lexer builder for the toy language (built on rply).

from string import ascii_letters, digits

from rply import LexerGenerator

# Characters permitted in identifiers: ASCII letters, underscore, digits.
VALID_CHARACTERS = f"{ascii_letters}_{digits}"

# Keyword tokens: each keyword lexes as KEYWORD_<NAME>.
# The trailing \b stops a keyword from matching the prefix of a longer
# identifier (e.g. "do" inside "double"): rply tries rules in the order
# they were added and takes the first match, not the longest one.
KEYWORD_TOKENS = [("KEYWORD_" + word.upper(), word + r"\b") for word in [
    "hello",
    "goodbye",
    "maybe",
    "do",
    "if",
    # "because",
    "until",
    "define",
    "as",
    "variable",
    "return",
    "argument",
]]

# Built-in function names: each lexes as the shared token type BUILTIN.
# The trailing \b prevents a builtin from matching the prefix of a longer
# identifier (e.g. "print" inside "printer"), since rply accepts the first
# rule that matches rather than the longest match.
BUILTIN_TOKENS = [("BUILTIN", name + r"\b") for name in [
    "print",
    "input",
    "random",
]]

# Literal-value tokens.  DATA_FLOAT must be registered before DATA_INT:
# rply tries rules in registration order, so with DATA_INT first the
# input "3.14" would lex as DATA_INT "3" and then fail on ".14".
DATA_TOKENS = [
    ("DATA_STRING", r"\'.*?\'"),   # single-quoted, non-greedy, single-line
    ("DATA_FLOAT", r"\d+\.\d+"),
    ("DATA_INT", r"\d+"),
]

# Operator and punctuation tokens.  The two-character arrow ("->") is
# registered ahead of SYMBOL_MINUS and SYMBOL_GT so it is tried first.
SYMBOL_TOKENS = [("SYMBOL_" + tag, pattern) for tag, pattern in (
    ("SET", r"\-\>"),
    # ("LPARENS", r"\("),
    # ("RPARENS", r"\)"),
    ("LBRACKET", r"\["),
    ("RBRACKET", r"\]"),
    ("LCURL", r"\{"),
    ("RCURL", r"\}"),
    ("PLUS", r"\+"),
    ("MINUS", r"\-"),
    ("TIMES", r"\*"),
    ("DIVIDE", r"\/"),
    # ("COMMA", r"\,"),
    # ("COLON", r"\:"),
    ("SEMICOLON", r"\;"),
    ("PIPE", r"\|"),
    ("QUOTE", r"\""),
    ("LT", r"\<"),
    ("GT", r"\>"),
    ("EQUALS", r"\="),
    ("DOLLAR", r"\$"),
)]

# Complete rule list in registration order: keyword and builtin rules come
# before the catch-all ID rule, which is last so it only matches words that
# no earlier rule claimed.
ALL_TOKENS = [
    *KEYWORD_TOKENS,
    *BUILTIN_TOKENS,
    *DATA_TOKENS,
    *SYMBOL_TOKENS,
    ("ARG", r"\#\d+"),
    ("ID", f"[{VALID_CHARACTERS}]+"),
]

class Lexer:
    """Builds an rply lexer from the module-level token tables."""

    def __init__(self):
        self.lexer = LexerGenerator()

    def _add_tokens(self):
        """Register every token rule, then the ignore pattern."""
        for token in ALL_TOKENS:
            self.lexer.add(*token)
        # Skip whitespace and "//" line comments.  Using "//[^\n]*" instead
        # of "//.*\n" also ignores a comment on the final line of a file
        # that lacks a trailing newline; "\n" is already matched by "\s",
        # so the old "[\s\n]+" is simplified to "\s+".
        self.lexer.ignore(r"\s+|//[^\n]*")

    def get_lexer(self):
        """Register all rules and return the built rply lexer."""
        self._add_tokens()
        return self.lexer.build()