"""Token definitions and lexer factory built on rply's LexerGenerator.

Token order matters: rply tries patterns in the order they are added, so
longer/more specific patterns (floats, multi-char symbols, keywords) must
be registered before the shorter patterns that would otherwise shadow them
(ints, single-char symbols, ID).
"""

from string import ascii_letters, digits

from rply import LexerGenerator

# Characters permitted in identifiers.
VALID_CHARACTERS = ascii_letters + "_" + digits

# Language keywords. The trailing \b (word boundary) stops a keyword from
# matching the prefix of a longer identifier (e.g. "if" inside "iffy",
# "do" inside "done").
KEYWORD_TOKENS = [
    ("KEYWORD_" + word.upper(), word + r"\b")
    for word in [
        "hello", "goodbye", "maybe", "do", "if", "because", "until",
        "define", "as", "variable", "return", "argument", "set",
        "skip", "else",
    ]
]

# Built-in function names; same word-boundary guard as keywords
# (e.g. "print" must not match inside "printer").
BUILTIN_TOKENS = [
    ("BUILTIN", name + r"\b")
    for name in ["print", "input", "random"]
]

# Literals. DATA_FLOAT must precede DATA_INT: rply tries patterns in
# registration order, so with INT first "3.14" would lex as INT(3)
# followed by a stray ".14".
DATA_TOKENS = [
    ("DATA_STRING", r"\'.*?\'"),
    ("DATA_FLOAT", r"\d+(\.\d+)"),
    ("DATA_INT", r"\d+"),
]

# Operators and punctuation. Multi-char symbols (->, \/, /\) come before
# the single-char symbols they start with (-, \, /).
SYMBOL_TOKENS = [
    ("SYMBOL_OR", r"\\\/"),
    ("SYMBOL_AND", r"\/\\"),
    ("SYMBOL_SET", r"\-\>"),
    ("SYMBOL_TILDE", r"\~"),
    ("SYMBOL_LPARENS", r"\("),
    ("SYMBOL_RPARENS", r"\)"),
    ("SYMBOL_LBRACKET", r"\["),
    ("SYMBOL_RBRACKET", r"\]"),
    ("SYMBOL_LCURL", r"\{"),
    ("SYMBOL_RCURL", r"\}"),
    ("SYMBOL_PLUS", r"\+"),
    ("SYMBOL_MINUS", r"\-"),
    ("SYMBOL_TIMES", r"\*"),
    ("SYMBOL_DIVIDE", r"\/"),
    # ("SYMBOL_COMMA", r"\,"),
    # ("SYMBOL_COLON", r"\:"),
    ("SYMBOL_SEMICOLON", r"\;"),
    ("SYMBOL_PIPE", r"\|"),
    ("SYMBOL_QUOTE", r"\""),
    ("SYMBOL_LT", r"\<"),
    ("SYMBOL_GT", r"\>"),
    ("SYMBOL_EQUALS", r"\="),
    # ("SYMBOL_DOLLAR", r"\$")
]

# Full registration order: keywords/builtins before ID so identifiers do
# not swallow reserved words; ID last as the catch-all.
ALL_TOKENS = (
    KEYWORD_TOKENS
    + BUILTIN_TOKENS
    + DATA_TOKENS
    + SYMBOL_TOKENS
    + [("ARG", r"\#\d+")]
    + [("ID", f"[{VALID_CHARACTERS}]+")]
)


class Lexer:
    """Factory that configures and builds the rply lexer."""

    def __init__(self):
        self.lexer = LexerGenerator()

    def _add_tokens(self):
        """Register every token pattern and the ignore rules."""
        for token in ALL_TOKENS:
            self.lexer.add(*token)
        # Skip whitespace and // line comments. [^\n]* (rather than .*\n)
        # also skips a comment at end-of-file with no trailing newline.
        self.lexer.ignore(r"\s+|//[^\n]*")

    def get_lexer(self):
        """Return a built rply lexer with all tokens registered."""
        self._add_tokens()
        return self.lexer.build()