import streamlit as st import re ############################################# # 1) DEFINE YOUR MINIMAL TOKENIZER CLASSES # ############################################# class Token: """Represents a single token with a type, value, and position.""" def __init__(self, token_type, value, position=None): self.type = token_type self.value = value self.position = position def __repr__(self): return f"Token(type='{self.type}', value='{self.value}', position={self.position})" class Tokenizer: """A simple tokenizer for WORD, NUMBER, and SPACE.""" token_specifications = [ ('NUMBER', r'\d+'), ('WORD', r'[A-Za-z]+'), ('SPACE', r'\s+'), ('PUNCT', r'[^\w\s]'), # <--- Added punctuation pattern ] combined_pattern = '|'.join( f'(?P<{name}>{pattern})' for (name, pattern) in token_specifications ) def __init__(self, text): self.text = text self.regex = re.compile(self.combined_pattern) def tokenize(self): tokens = [] for match in self.regex.finditer(self.text): token_type = match.lastgroup token_value = match.group(token_type) position = match.start() # Ignore spaces if token_type == 'SPACE': continue tokens.append(Token(token_type, token_value, position)) return tokens ############################################# # 2) STREAMLIT APP LAYOUT & FUNCTIONALITY # ############################################# # Inject custom CSS for blinking boxes, unique coloring, etc. st.markdown( """ """, unsafe_allow_html=True ) st.title("My Tokenizer App") # Let the user enter text user_input = st.text_input("Enter your text:", "Hello world 123!") # When the user clicks the button, we run the tokenizer if st.button("Tokenize"): tokenizer = Tokenizer(user_input) tokens = tokenizer.tokenize() # Show the tokens in blinking boxes, each in a unique color # For a simple approach, define a list of colors we can cycle through color_list = ["blue", "red", "orange", "purple", "green", "teal", "magenta"] # We'll display them horizontally (inline) for i, tok in enumerate(tokens): color = color_list[i % len(color_list)] # pick color in a round-robin style # HTML for the blinking box with color, token value on top st.markdown( f"""