Upload 11 files
- src/o1/__init__.py +1 -0
- src/o1/__pycache__/__init__.cpython-312.pyc +0 -0
- src/o1/__pycache__/agent.cpython-312.pyc +0 -0
- src/o1/__pycache__/mcts.cpython-312.pyc +0 -0
- src/o1/__pycache__/train.cpython-312.pyc +0 -0
- src/o1/__pycache__/utils.cpython-312.pyc +0 -0
- src/o1/agent.py +133 -0
- src/o1/mcts.py +96 -0
- src/o1/selfplay.py +37 -0
- src/o1/train.py +162 -0
- src/o1/utils.py +144 -0
src/o1/__init__.py
ADDED
# o1 package
src/o1/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (146 Bytes)
src/o1/__pycache__/agent.cpython-312.pyc
ADDED
Binary file (9.63 kB)
src/o1/__pycache__/mcts.cpython-312.pyc
ADDED
Binary file (5.4 kB)
src/o1/__pycache__/train.cpython-312.pyc
ADDED
Binary file (8.77 kB)
src/o1/__pycache__/utils.cpython-312.pyc
ADDED
Binary file (7 kB)
src/o1/agent.py
ADDED
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import chess

class SEBlock(nn.Module):
    def __init__(self, channels, reduction=16):
        super().__init__()
        self.fc1 = nn.Linear(channels, channels // reduction)
        self.fc2 = nn.Linear(channels // reduction, channels)

    def forward(self, x):
        b, c, h, w = x.size()
        y = x.view(b, c, -1).mean(dim=2)
        y = F.relu(self.fc1(y))
        y = torch.sigmoid(self.fc2(y))
        y = y.view(b, c, 1, 1)
        return x * y

class ResidualBlock(nn.Module):
    def __init__(self, channels, dropout=0.2):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)
        self.se = SEBlock(channels)
        self.dropout = nn.Dropout2d(dropout)

    def forward(self, x):
        residual = x
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out = self.se(out)
        out = self.dropout(out)
        out += residual
        return F.relu(out)

class ChessNet(nn.Module):
    def __init__(self, input_channels=17, board_size=8, policy_size=4672, num_blocks=20):
        super().__init__()
        self.conv_in = nn.Conv2d(input_channels, 256, kernel_size=3, padding=1)
        self.bn_in = nn.BatchNorm2d(256)
        self.res_blocks = nn.Sequential(*[ResidualBlock(256) for _ in range(num_blocks)])
        self.fc1 = nn.Linear(256 * board_size * board_size, 512)
        self.ln_fc1 = nn.LayerNorm(512)
        # Policy head
        self.policy_head1 = nn.Linear(512, 256)
        self.policy_head2 = nn.Linear(256, policy_size)
        # Value head
        self.value_head1 = nn.Linear(512, 128)
        self.value_head2 = nn.Linear(128, 1)

    def forward(self, x):
        x = F.relu(self.bn_in(self.conv_in(x)))
        x = self.res_blocks(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.ln_fc1(self.fc1(x)))
        # Policy head
        policy = F.relu(self.policy_head1(x))
        policy = self.policy_head2(policy)
        # Value head
        value = F.relu(self.value_head1(x))
        value = torch.tanh(self.value_head2(value))
        return policy, value

class Agent:
    def __init__(self, device='cpu'):
        self.device = device
        self.model = ChessNet().to(device)
        self.model.eval()

    def board_to_tensor(self, board):
        # 12x8x8 binary planes for piece types/colors
        piece_map = board.piece_map()
        tensor = np.zeros((17, 8, 8), dtype=np.float32)
        for square, piece in piece_map.items():
            idx = self.piece_to_index(piece)
            row, col = divmod(square, 8)
            tensor[idx, row, col] = 1
        # Add castling rights (4 planes)
        if board.has_kingside_castling_rights(chess.WHITE):
            tensor[12, :, :] = 1
        if board.has_queenside_castling_rights(chess.WHITE):
            tensor[13, :, :] = 1
        if board.has_kingside_castling_rights(chess.BLACK):
            tensor[14, :, :] = 1
        if board.has_queenside_castling_rights(chess.BLACK):
            tensor[15, :, :] = 1
        # Add move count (normalized, 1 plane)
        tensor[16, :, :] = board.fullmove_number / 100.0
        # Optionally, add repetition or other features here
        return torch.tensor(tensor, device=self.device).unsqueeze(0)

    def piece_to_index(self, piece):
        # 0-5: white P,N,B,R,Q,K; 6-11: black P,N,B,R,Q,K
        offset = 0 if piece.color == chess.WHITE else 6
        piece_type_map = {
            chess.PAWN: 0,
            chess.KNIGHT: 1,
            chess.BISHOP: 2,
            chess.ROOK: 3,
            chess.QUEEN: 4,
            chess.KING: 5
        }
        return offset + piece_type_map[piece.piece_type]

    def predict(self, board):
        x = self.board_to_tensor(board)
        with torch.no_grad():
            policy_logits, value = self.model(x)
        return policy_logits, value

    def diffusion_sample(self, policy_logits, steps=10, noise_scale=1.0):
        """
        Apply a simple diffusion process to the policy logits.
        At each step, add Gaussian noise and denoise by averaging with the original logits.
        """
        x = policy_logits.clone()
        orig = policy_logits.clone()
        for _ in range(steps):
            noise = torch.randn_like(x) * noise_scale
            x = x + noise
            x = (x + orig) / 2  # simple denoising step
        return x

    def predict_with_diffusion(self, board, steps=10, noise_scale=1.0):
        x = self.board_to_tensor(board)
        with torch.no_grad():
            policy_logits, value = self.model(x)
        diffused_logits = self.diffusion_sample(policy_logits, steps=steps, noise_scale=noise_scale)
        return diffused_logits, value
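
For reference, a minimal usage sketch of the Agent class above (not part of the commit). It assumes the src/ directory is on PYTHONPATH so the package imports as o1, and it only inspects the output shapes of an untrained network; the diffusion parameters shown are arbitrary toy values.

    # Sketch: query an untrained Agent on the starting position.
    import chess
    from o1.agent import Agent

    agent = Agent(device='cpu')
    board = chess.Board()

    # Plain forward pass: raw policy logits (1 x 4672) and a tanh value (1 x 1).
    policy_logits, value = agent.predict(board)
    print(policy_logits.shape, value.item())

    # "Diffusion" variant: same shapes, logits perturbed and re-averaged per step.
    diffused_logits, value = agent.predict_with_diffusion(board, steps=5, noise_scale=0.5)
    print(diffused_logits.shape, value.item())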
src/o1/mcts.py
ADDED
"""
Monte Carlo Tree Search (MCTS) for o1 agent.
Basic implementation: runs simulations, selects moves by visit count.
Integrate with neural net for policy/value guidance for full strength.
"""
import chess
import random
from collections import defaultdict
import torch

class MCTSNode:
    def __init__(self, board, parent=None, move=None):
        self.board = board.copy()
        self.parent = parent
        self.move = move
        self.children = []
        self.visits = 0
        self.value = 0.0
        self.untried_moves = list(board.legal_moves)

    def is_fully_expanded(self):
        return len(self.untried_moves) == 0

    def best_child(self, c_param=1.4):
        choices = [
            (child.value / (child.visits + 1e-6) + c_param * ((2 * (self.visits + 1e-6)) ** 0.5 / (child.visits + 1e-6)), child)
            for child in self.children
        ]
        return max(choices, key=lambda x: x[0])[1]

class MCTS:
    def __init__(self, agent=None, simulations=50):
        self.agent = agent
        self.simulations = simulations

    def search(self, board, restrict_top_n=None):
        root = MCTSNode(board)
        for _ in range(self.simulations):
            node = root
            sim_board = board.copy()
            # Selection
            while node.is_fully_expanded() and node.children:
                node = node.best_child()
                sim_board.push(node.move)
            # Expansion
            if node.untried_moves:
                move = random.choice(node.untried_moves)
                sim_board.push(move)
                child = MCTSNode(sim_board, parent=node, move=move)
                node.children.append(child)
                node.untried_moves.remove(move)
                node = child
            # Simulation
            result = self.simulate(sim_board)
            # Backpropagation
            # If it's black's turn at the node, invert the value for correct perspective
            invert = False
            temp_node = node
            while temp_node.parent is not None:
                temp_node = temp_node.parent
                invert = not invert
            value = -result if invert else result
            while node:
                node.visits += 1
                node.value += value
                node = node.parent
        # Choose move with most visits, but restrict to top-N if specified
        if not root.children:
            return random.choice(list(board.legal_moves))
        children_sorted = sorted(root.children, key=lambda c: c.visits, reverse=True)
        if restrict_top_n is not None and restrict_top_n < len(children_sorted):
            # Only consider top-N moves
            children_sorted = children_sorted[:restrict_top_n]
        best = max(children_sorted, key=lambda c: c.visits)
        return best.move

    def simulate(self, board, use_diffusion=True, diffusion_steps=10, noise_scale=1.0):
        # Use neural network to evaluate the board instead of random playout
        if self.agent is not None:
            with torch.no_grad():
                if use_diffusion and hasattr(self.agent, 'predict_with_diffusion'):
                    _, value = self.agent.predict_with_diffusion(board, steps=diffusion_steps, noise_scale=noise_scale)
                else:
                    _, value = self.agent.predict(board)
            return value.item()
        # Fallback: play random moves until game ends
        while not board.is_game_over():
            move = random.choice(list(board.legal_moves))
            board.push(move)
        result = board.result()
        if result == '1-0':
            return 1
        elif result == '0-1':
            return -1
        else:
            return 0
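
A short sketch (not part of the commit) of driving MCTS.search directly. It assumes the o1 package is importable; the simulation counts are toy values, and with agent=None the search falls back to full random playouts, which can be slow.

    # Sketch: pick a move for the starting position with a small search budget.
    import chess
    from o1.agent import Agent
    from o1.mcts import MCTS

    board = chess.Board()

    # Network-guided evaluation (the value head replaces random rollouts).
    mcts = MCTS(agent=Agent(), simulations=10)
    print("chosen move:", mcts.search(board))

    # Pure random-playout fallback (no agent attached), limited to the top 3 children.
    mcts_random = MCTS(agent=None, simulations=10)
    print("random-rollout move:", mcts_random.search(board, restrict_top_n=3))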
src/o1/selfplay.py
ADDED
"""
Self-play orchestration for o1 agent.
Runs self-play games using MCTS for move selection.
"""
import chess
from o1.mcts import MCTS

def run_selfplay(agent, num_games=1, simulations=50):
    """Run self-play games using MCTS and return experience."""
    all_experience = []
    for game_idx in range(num_games):
        board = chess.Board()
        mcts = MCTS(agent, simulations=simulations)
        game_data = []
        while not board.is_game_over():
            move = mcts.search(board)
            state_tensor = agent.board_to_tensor(board)
            # Policy: one-hot for chosen move (for now)
            policy = [0] * 4672  # 4672 matches the policy head output size (policy_size in ChessNet)
            move_idx = list(board.legal_moves).index(move)
            policy[move_idx] = 1
            value = 0  # Placeholder, will be set after game
            game_data.append((state_tensor, policy, value))
            board.push(move)
        # Assign final result as value for all positions
        result = board.result()
        if result == '1-0':
            z = 5
        elif result == '0-1':
            z = -1
        else:
            z = 0
        game_data = [(s, p, z) for (s, p, v) in game_data]
        all_experience.extend(game_data)
    return all_experience

# Self-play loop implementation will go here
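
A sketch (not part of the commit) of calling run_selfplay and inspecting one experience tuple. It assumes the o1 package is importable; a full game with an untrained network and per-move MCTS can take a while on CPU, so the simulation budget here is deliberately tiny.

    # Sketch: generate a little self-play experience and inspect one sample.
    from o1.agent import Agent
    from o1.selfplay import run_selfplay

    agent = Agent()
    experience = run_selfplay(agent, num_games=1, simulations=5)

    state, policy, z = experience[0]
    print(len(experience), "positions collected")
    print("state tensor:", state.shape)           # (1, 17, 8, 8)
    print("one-hot policy length:", len(policy))  # 4672
    print("game outcome label z:", z)             # same for every position in the game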
src/o1/train.py
ADDED
import chess
import random
import torch
import torch.nn as nn
import torch.optim as optim
from o1.agent import Agent
from o1.mcts import MCTS
from o1.utils import save_board_svg, save_model

class ExperienceBuffer:
    def __init__(self, max_size=10000):
        self.buffer = []
        self.max_size = max_size

    def add(self, experience):
        if len(self.buffer) >= self.max_size:
            self.buffer.pop(0)
        self.buffer.append(experience)

    def sample(self, batch_size):
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def get_tensors(self, batch):
        # Convert batch of (state_tensor, policy, value) to tensors
        # Ensure state tensors are float32 and have correct shape
        states = torch.cat([s.float() for (s, _, _) in batch], dim=0)
        policies = torch.tensor([p for (_, p, _) in batch], dtype=torch.float32)
        values = torch.tensor([v for (_, _, v) in batch], dtype=torch.float32).unsqueeze(1)
        return states, policies, values

def self_play_game(agent, simulations=10, save_svg=False, svg_prefix="game", max_moves=40):
    # Randomly choose o1's color for this game
    o1_color = random.choice([chess.WHITE, chess.BLACK])
    board = chess.Board()
    mcts = MCTS(agent, simulations=simulations)
    game_data = []
    move_num = 0
    print(f"o1 is playing as {'White' if o1_color == chess.WHITE else 'Black'}")

    while not board.is_game_over() and move_num < max_moves:
        # Determine if it's o1's turn
        o1_turn = (board.turn == o1_color)
        if o1_turn:
            move = mcts.search(board)
        else:
            # Opponent: random move
            move = random.choice(list(board.legal_moves))
        print(f"Move {move_num + 1}: {move}")
        state_tensor = agent.board_to_tensor(board)
        policy = [0] * 4672
        move_idx = list(board.legal_moves).index(move)
        policy[move_idx] = 1
        value = 0  # Placeholder, will be set after game
        game_data.append((state_tensor, policy, value))
        board.push(move)
        if save_svg:
            save_board_svg(board, f"{svg_prefix}_move{move_num}.svg")
        move_num += 1

    print(f"Game ended after {move_num} moves")
    print(f"Final position:\n{board}")

    penalty = 0
    if board.is_game_over():
        outcome = board.outcome(claim_draw=True)
        if outcome:
            termination = outcome.termination.name
            if outcome.winner is None:
                if termination == "STALEMATE":
                    winner_str = "Draw (stalemate)"
                    z = 0
                elif termination == "INSUFFICIENT_MATERIAL":
                    winner_str = "Draw (insufficient material)"
                    z = 0
                    penalty = z
                else:
                    winner_str = f"Draw ({termination.lower()})"
                    z = 0
                    penalty = z
            elif outcome.winner:
                winner_str = "White wins"
                if o1_color == chess.WHITE:
                    z = 5
                else:
                    z = -1  # Penalize o1 if it was black and lost
            else:
                winner_str = "Black wins"
                if o1_color == chess.BLACK:
                    z = 5
                else:
                    z = -1  # Penalize o1 if it was white and lost
            print(f"Game over reason: {board.result()} ({termination})")
            print(f"Result: {winner_str}")
            if penalty:
                print(f"Penalty applied: {penalty}")
        else:
            print(f"Game over reason: {board.result()} (unknown termination)")
            z = 0
            print(f"Penalty applied: {z}")
    else:
        print("Game reached move limit - applying increased penalty")
        print("Result: No winner (move limit reached)")
        z = -2.0
        print(f"Penalty applied: {z}")

    game_data = [(s, p, z) for (s, p, v) in game_data]
    if save_svg:
        save_board_svg(board, f"{svg_prefix}_final.svg")
    return game_data

def train_step(agent, buffer, optimizer, batch_size=32):
    if len(buffer.buffer) < batch_size:
        return
    batch = buffer.sample(batch_size)
    states, target_policies, target_values = buffer.get_tensors(batch)
    agent.model.train()
    optimizer.zero_grad()
    pred_policies, pred_values = agent.model(states)
    # Policy loss (cross-entropy)
    policy_loss = -torch.sum(target_policies * torch.log_softmax(pred_policies, dim=1)) / batch_size
    # Value loss (MSE)
    value_loss = nn.functional.mse_loss(pred_values, target_values)
    loss = policy_loss + value_loss
    loss.backward()
    optimizer.step()
    print(f"Train step: loss={loss.item():.4f} (policy={policy_loss.item():.4f}, value={value_loss.item():.4f})")

def main():
    agent = Agent()
    # Try to load pretrained weights if available
    import os
    from o1.utils import load_model
    pretrained_path = "trained_agent.pth"
    if os.path.exists(pretrained_path):
        print(f"Loading pretrained weights from {pretrained_path}...")
        load_model(agent, pretrained_path)
    else:
        print("No pretrained weights found. Training from scratch.")
    buffer = ExperienceBuffer()
    optimizer = optim.Adam(agent.model.parameters(), lr=1e-4)
    num_games = 20  # Number of self-play games per run
    global_reward = 0
    for i in range(num_games):
        print(f"Self-play game {i+1}")
        # Only save board SVGs for the last game
        save_svg = (i == num_games - 1)
        game_experience = self_play_game(agent, simulations=10, max_moves=300,
                                         save_svg=save_svg,
                                         svg_prefix="final_game")
        for exp in game_experience:
            buffer.add(exp)
        # Log the reward for this game (all z are the same for the game)
        if game_experience:
            game_reward = game_experience[0][2]
            global_reward += game_reward
            print(f"Reward for this game: {game_reward}")
            print(f"Cumulative global reward: {global_reward}")
        train_step(agent, buffer, optimizer)
    print("Pipeline complete. Self-play now uses MCTS for move selection and real learning.")
    # Save the trained model at the end
    save_model(agent, "trained_agent.pth")
    print("Model saved as trained_agent.pth")

if __name__ == "__main__":
    main()
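
For intuition, a self-contained sketch (not part of the commit) of the loss computed in train_step above: with one-hot policy targets the cross-entropy term reduces to the negative log-softmax at the chosen move's slot, and the value term is a plain MSE against the outcome label z. The tensors, slot indices, and z values below are dummies, not real network outputs.

    # Standalone sketch of the train_step loss on dummy tensors.
    import torch
    import torch.nn as nn

    batch_size, policy_size = 4, 4672
    pred_policies = torch.randn(batch_size, policy_size)  # stand-in for model logits
    pred_values = torch.randn(batch_size, 1)               # stand-in for tanh values

    # One-hot policy targets (as built in self_play_game) at arbitrary slots,
    # plus outcome labels z in the style used by the pipeline.
    target_policies = torch.zeros(batch_size, policy_size)
    target_policies[torch.arange(batch_size), torch.tensor([3, 17, 42, 100])] = 1.0
    target_values = torch.tensor([[1.0], [-1.0], [0.0], [-2.0]])

    policy_loss = -torch.sum(target_policies * torch.log_softmax(pred_policies, dim=1)) / batch_size
    value_loss = nn.functional.mse_loss(pred_values, target_values)
    print("policy loss:", policy_loss.item(), "value loss:", value_loss.item())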
src/o1/utils.py
ADDED
"""
Utility functions for o1 agent.
Includes ELO calculation, FEN helpers, experience save/load, and more.
"""
import pickle
import chess
import torch
import chess.svg
import os

def calculate_elo(rating_a, rating_b, result, k=32):
    """Update ELO rating for player A given result (1=win, 0.5=draw, 0=loss)."""
    expected = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    new_rating = rating_a + k * (result - expected)
    return new_rating

def board_to_fen(board):
    """Convert a chess.Board to FEN string."""
    return board.fen()

def fen_to_board(fen):
    """Convert a FEN string to chess.Board."""
    return chess.Board(fen)

def save_experience(buffer, filename):
    """Save experience buffer to file."""
    with open(filename, 'wb') as f:
        pickle.dump(buffer.buffer, f)

def load_experience(filename):
    """Load experience buffer from file."""
    with open(filename, 'rb') as f:
        return pickle.load(f)

def save_model(agent, filename):
    """Save the agent's model to a file."""
    torch.save(agent.model.state_dict(), filename)

def load_model(agent, filename):
    """Load the agent's model from a file."""
    agent.model.load_state_dict(torch.load(filename))
    agent.model.eval()

def save_board_svg(board, filename):
    """Save the current board position as an SVG image."""
    svg_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "game_svgs")
    os.makedirs(svg_dir, exist_ok=True)
    filepath = os.path.join(svg_dir, filename)
    svg = chess.svg.board(board=board)
    with open(filepath, 'w') as f:
        f.write(svg)

# Option 3: Model Architecture Tuning
# Try deeper/smaller networks, different block types, or alternative architectures.

def try_alternative_architectures(agent_class, architectures):
    """Try different model architectures and return their performance."""
    results = {}
    for arch in architectures:
        agent = agent_class(arch=arch)
        # Evaluate agent (placeholder, implement evaluation logic)
        results[arch] = None  # Fill with actual evaluation
    return results

# Option 4: Hyperparameter Tuning
# Use grid/random search to find optimal hyperparameters.

def grid_search(train_func, param_grid):
    """Perform grid search over hyperparameters."""
    import itertools
    keys, values = zip(*param_grid.items())
    best_score = None
    best_params = None
    for v in itertools.product(*values):
        params = dict(zip(keys, v))
        score = train_func(**params)
        if best_score is None or score > best_score:
            best_score = score
            best_params = params
    return best_params, best_score

# Option 5: Regularization
# Add dropout, L2 regularization, or early stopping.

def add_regularization(model, dropout=0.2, l2=1e-4):
    """Add dropout and L2 regularization to the model."""
    # This is a placeholder; actual implementation depends on model code
    for module in model.modules():
        if hasattr(module, 'dropout'):
            module.dropout.p = dropout
    return model, l2

# Option 6: Cross-Validation
# Use k-fold cross-validation for robust evaluation.

def k_fold_cross_validation(train_func, k=5, *args, **kwargs):
    """Perform k-fold cross-validation."""
    scores = []
    for i in range(k):
        score = train_func(fold=i, *args, **kwargs)
        scores.append(score)
    return sum(scores) / len(scores)

# Option 7: Ensemble Methods
# Combine multiple models for better performance.

def ensemble_predict(models, input_tensor):
    """Average predictions from multiple models."""
    outputs = [model(input_tensor) for model in models]
    # Assume outputs are tuples (policy, value)
    avg_policy = sum([o[0] for o in outputs]) / len(outputs)
    avg_value = sum([o[1] for o in outputs]) / len(outputs)
    return avg_policy, avg_value

# ELO Evaluation for Model

def evaluate_model_elo(agent, opponent, num_games=20, initial_elo=1500):
    """Play games between agent and opponent, return estimated ELO for agent."""
    agent_elo = initial_elo
    opp_elo = initial_elo
    import random
    import chess
    from o1.mcts import MCTS
    for i in range(num_games):
        board = chess.Board()
        mcts_agent = MCTS(agent)
        mcts_opp = MCTS(opponent)
        turn = random.choice([True, False])
        while not board.is_game_over():
            if board.turn == turn:
                move = mcts_agent.search(board)
            else:
                move = mcts_opp.search(board)
            board.push(move)
        result = board.result()
        if result == '1-0':
            agent_score = 1 if turn else 0
        elif result == '0-1':
            agent_score = 0 if turn else 1
        else:
            agent_score = 0.5
        agent_elo = calculate_elo(agent_elo, opp_elo, agent_score)
        opp_elo = calculate_elo(opp_elo, agent_elo, 1 - agent_score)
    return agent_elo
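
A quick worked example (not part of the commit) of calculate_elo above: with equal ratings the expected score is 0.5, so a win at k=32 gains exactly 16 points; a loss to a higher-rated opponent costs less than a loss to an equal one.

    # Worked example for calculate_elo: equal ratings, one decisive game.
    from o1.utils import calculate_elo

    # Expected score = 1 / (1 + 10**((1500 - 1500) / 400)) = 0.5
    print(calculate_elo(1500, 1500, result=1))    # win:  1500 + 32 * (1 - 0.5) = 1516.0
    print(calculate_elo(1500, 1500, result=0.5))  # draw: 1500.0 (no change)
    print(calculate_elo(1500, 1700, result=0))    # loss to stronger player: ~1492.3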