import html
import re

import streamlit as st

#############################################
# 1) DEFINE YOUR MINIMAL TOKENIZER CLASSES  #
#############################################
class Token:
    """Represents a single token with a type, value, and position."""
    def __init__(self, token_type, value, position=None):
        self.type = token_type
        self.value = value
        self.position = position
    
    def __repr__(self):
        return f"Token(type='{self.type}', value='{self.value}', position={self.position})"


class Tokenizer:
    """A simple tokenizer for WORD, NUMBER, and SPACE."""
    
    token_specifications = [
        ('NUMBER', r'\d+'),
        ('WORD',   r'[A-Za-z]+'),
        ('SPACE',  r'\s+'),
        ('PUNCT',  r'[^\w\s]'),  # any single character that is neither a word character nor whitespace
    ]
    
    combined_pattern = '|'.join(
        f'(?P<{name}>{pattern})' for (name, pattern) in token_specifications
    )
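    # The alternatives above are joined into one pattern with named groups, i.e.:
    #   (?P<NUMBER>\d+)|(?P<WORD>[A-Za-z]+)|(?P<SPACE>\s+)|(?P<PUNCT>[^\w\s])
    # so match.lastgroup reports which token type matched.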
    
    def __init__(self, text):
        self.text = text
        self.regex = re.compile(self.combined_pattern)
    
    def tokenize(self):
        tokens = []
        for match in self.regex.finditer(self.text):
            token_type = match.lastgroup
            token_value = match.group(token_type)
            position = match.start()
            
            # Ignore spaces
            if token_type == 'SPACE':
                continue
            
            tokens.append(Token(token_type, token_value, position))
        return tokens
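
# Example (a sketch; the Tokenizer can be exercised on its own, outside Streamlit):
#   Tokenizer("Hello world 123!").tokenize() yields roughly:
#   [Token(type='WORD', value='Hello', position=0),
#    Token(type='WORD', value='world', position=6),
#    Token(type='NUMBER', value='123', position=12),
#    Token(type='PUNCT', value='!', position=15)]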

#############################################
# 2) STREAMLIT APP LAYOUT & FUNCTIONALITY   #
#############################################

# Inject custom CSS for blinking boxes, unique coloring, etc.
st.markdown(
    """
    <style>
    /* Define a blinking animation */
    @keyframes blink {
      0%   { background-color: white; }
      50%  { background-color: lightgreen; }
      100% { background-color: white; }
    }

    .blink-box {
      display: inline-block;
      border: 1px solid #ccc;
      padding: 5px 10px;
      margin: 5px;
      border-radius: 5px;
      animation: blink 2s infinite;
      /* The per-token text color is passed from Python via an inline style */
    }
    </style>
    """,
    unsafe_allow_html=True
)
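
# Note: the .blink-box rule above only animates the background color;
# each token's text color is set separately via an inline style in the loop below.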

st.title("My Tokenizer App")

# Let the user enter text
user_input = st.text_input("Enter your text:", "Hello world 123!")

# When the user clicks the button, we run the tokenizer
if st.button("Tokenize"):
    tokenizer = Tokenizer(user_input)
    tokens = tokenizer.tokenize()
    
    # Show the tokens in blinking boxes, each in a unique color.
    # For a simple approach, define a list of colors we can cycle through.
    color_list = ["blue", "red", "orange", "purple", "green", "teal", "magenta"]
    
    # Build a single HTML string so the boxes render inline, side by side;
    # a separate st.markdown call per token would stack them vertically.
    boxes_html = ""
    for i, tok in enumerate(tokens):
        color = color_list[i % len(color_list)]  # pick a color in round-robin style
        # Escape the token value so characters like < or & display literally
        boxes_html += (
            f'<div class="blink-box" style="color:{color};">'
            f'<strong>{html.escape(tok.value)}</strong>'
            f'</div>'
        )
    st.markdown(boxes_html, unsafe_allow_html=True)
    
    # Add a small separator
    st.write("---")
    
    # Now, for each token, display details below
    # We can do it as separate st.markdown or st.write lines
    for i, tok in enumerate(tokens):
        st.subheader(f"Token {i+1}: {tok.value}")
        st.write(f"**Type:** {tok.type}")
        st.write(f"**Value:** {tok.value}")
        st.write(f"**Position:** {tok.position}")
        st.write("---")