Spaces:

lingyu98
/

CiJiang

Sleeping

App Files Files Community

lingyu98 committed on Aug 4

Commit

f4134b8

verified ·

1 Parent(s): 99a698e

Create cijiang/rhyme.py

Browse files

Files changed (1) hide show

cijiang/rhyme.py +240 -0

cijiang/rhyme.py ADDED Viewed

	@@ -0,0 +1,240 @@

+import json
+import torch
+import numpy as np
+from tqdm import tqdm
+from collections import namedtuple
+from typing import List, Tuple, Dict
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from pypinyin import pinyin, Style
+# One beam-search hypothesis: the text built so far, its cumulative
+# log-probability, and the number of rhyme positions already filled.
+BeamEntry = namedtuple('BeamEntry', 'sequence log_prob position')
+def is_pinyin(syllable):
+    """Report whether the text is pure ASCII and so may be romanised pinyin.
+
+    Only the character set is inspected: any ASCII-only string, including
+    the empty string, yields True; a string with any non-ASCII character
+    (e.g. a Chinese character) yields False.
+    """
+    return syllable.isascii()
+class CiJiangRhymer:
+    """Rhyme-constrained text completion on top of a causal language model.
+
+    Construction loads the ``Qwen/Qwen3-0.6B-Base`` model and the rule files
+    under ``rules/``, then caches the pinyin of every vocabulary token that
+    decodes to a single Chinese character. ``get_rhymes`` beam-searches one
+    character per target syllable, keeping only characters whose final
+    (yunmu) is listed as a rhyme of the target syllable's final.
+    """
+    # Divisor applied to the summed log-probability before exponentiation in
+    # get_rhymes(); presumably there to keep the scores of multi-character
+    # completions in a readable range.
+    SCORE_TEMPERATURE = 10
+    def __init__(self, strict=True, tone=True, heteronym=False):
+        """Load the model and rules and build the token/pinyin cache.
+
+        Args:
+            strict: Use the ``'strict'`` rhyme lists from ``rules/rhymes.json``
+                when true, otherwise the ``'blurry'`` lists.
+            tone: When true, a candidate must also match the target tone
+                (tone ``5`` on either side matches any tone).
+            heteronym: When true, any reading of a character may satisfy the
+                rhyme; otherwise only the reading pypinyin returns with
+                ``heteronym=False`` is considered.
+        """
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self._load_model()
+        self._load_rules()
+        self.tone = tone
+        self.heteronym = heteronym
+        self.mode = 'strict' if strict else 'blurry'
+        # Pre-compute character mappings for efficiency
+        self._build_character_cache()
+    def _load_model(self):
+        """Load tokenizer and model, and snapshot the tokenizer vocabulary."""
+        model_name = "Qwen/Qwen3-0.6B-Base"
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        # Add padding token if it doesn't exist
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype="auto",
+            device_map="auto"
+        )
+        self.model.eval()
+        self.vocab = self.tokenizer.get_vocab()
+    def _load_rules(self):
+        """Read the syllable/yunmu/rhyme tables from the ``rules`` directory.
+
+        Paths are relative to the current working directory.
+        """
+        with open('rules/syllable_to_yunmu.json', 'r', encoding='utf-8') as f:
+            self.syllable_to_yunmu = json.load(f)
+        with open('rules/ALL_SYLLABLES.txt', 'r', encoding='utf-8') as f:
+            # split() with no argument never yields empty strings.
+            self.all_syllables = f.read().split()
+        with open('rules/rhymes.json', 'r', encoding='utf-8') as f:
+            self.rhymes = json.load(f)
+    def _build_character_cache(self):
+        """Pre-compute character to pinyin mappings for all vocabulary tokens.
+
+        Fills ``self.token_to_char`` (token id -> single Chinese character)
+        and ``self.char_to_pinyins`` (character -> ``{"hetero": [...],
+        "single": [...]}`` TONE3 readings, neutral tone written as ``5``).
+        """
+        print("Building character cache for faster lookup...")
+        self.char_to_pinyins = {}
+        self.token_to_char: Dict[int, str] = {}
+        # Iterate the ids that actually exist instead of range(len(vocab)):
+        # the token -> id mapping is not guaranteed to be contiguous.
+        token_ids = sorted(set(self.vocab.values()))
+        for token_id in tqdm(token_ids, desc="Caching characters"):
+            char = self.tokenizer.decode(token_id).strip()
+            if len(char) != 1 or not ('\u4e00' <= char <= '\u9fff'):
+                continue
+            self.token_to_char[token_id] = char
+            # Several tokens can decode to the same character; look it up once.
+            if char in self.char_to_pinyins:
+                continue
+            hetero_pinyins = pinyin(char, style=Style.TONE3,
+                                    heteronym=True, neutral_tone_with_five=True)[0]
+            pinyins = pinyin(char, style=Style.TONE3,
+                             heteronym=False, neutral_tone_with_five=True)[0]
+            self.char_to_pinyins[char] = {
+                "hetero": hetero_pinyins,
+                "single": pinyins
+            }
+    @staticmethod
+    def _split_tone(syl: str) -> Tuple[str, str]:
+        """Split a TONE3 syllable such as ``'bai3'`` into ``('bai', '3')``.
+
+        A syllable without a trailing digit is returned whole with tone
+        ``'5'``, which the matcher treats as "any tone".
+        """
+        if syl and syl[-1] in "0123456789":
+            return syl[:-1], syl[-1]
+        return syl, '5'
+    def _prefilter_tokens_by_rhyme(self, top_tokens: torch.Tensor, top_log_probs: torch.Tensor,
+                                  allowed_rhymes: set, target_tone: str) -> List[Tuple[str, float, int]]:
+        """Pre-filter tokens that match rhyming requirements using cached data.
+
+        Args:
+            top_tokens: 1-D tensor of candidate token ids.
+            top_log_probs: Log-probabilities aligned with ``top_tokens``.
+            allowed_rhymes: Finals (yunmu) accepted at this position.
+            target_tone: Tone digit to match; ``'5'`` matches any tone.
+
+        Returns:
+            ``(character, log_prob, token_id)`` for every candidate token that
+            maps to a cached Chinese character with an accepted reading, in
+            the order the tokens were given.
+        """
+        matching_candidates = []
+        # Keep ids as integers; only the log-probs need a float conversion
+        # (they may arrive in a reduced-precision dtype).
+        token_ids = top_tokens.cpu().tolist()
+        log_probs = top_log_probs.to(torch.float32).cpu().tolist()
+        reading_key = "hetero" if self.heteronym else "single"
+        for token_id, log_prob in zip(token_ids, log_probs):
+            char = self.token_to_char.get(token_id)
+            if char is None:
+                continue
+            for candidate_pinyin in self.char_to_pinyins[char][reading_key]:
+                # Need at least one letter plus the tone digit.
+                if len(candidate_pinyin) < 2:
+                    continue
+                candidate_syllable, candidate_tone = candidate_pinyin[:-1], candidate_pinyin[-1]
+                if self.syllable_to_yunmu.get(candidate_syllable) not in allowed_rhymes:
+                    continue
+                tone_ok = (not self.tone
+                           or candidate_tone == target_tone
+                           or target_tone == '5'
+                           or candidate_tone == '5')
+                if tone_ok:
+                    matching_candidates.append((char, float(log_prob), int(token_id)))
+                    # One accepted reading is enough for this character.
+                    break
+        return matching_candidates
+    def _get_next_token_probabilities(self, prompt: str, num_candidates: int = 200) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Get the top next-token candidates for ``prompt`` from the base model.
+
+        Returns:
+            ``(token_ids, log_probs)`` for the ``num_candidates`` highest-logit
+            tokens (capped at the vocabulary size); log-probs are taken from a
+            log-softmax over the full vocabulary.
+        """
+        # Base model: the prompt is fed as-is, no chat formatting.
+        model_inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+        with torch.inference_mode():
+            outputs = self.model(**model_inputs)
+            # Logits for the next token (last position of the only sequence)
+            next_token_logits = outputs.logits[0, -1, :]
+            k = min(num_candidates, next_token_logits.size(0))
+            top_tokens = next_token_logits.topk(k).indices
+            top_log_probs = torch.log_softmax(next_token_logits, dim=-1)[top_tokens]
+            return top_tokens, top_log_probs
+    def get_rhymes(self, text_with_placeholder: str, target_rhyme: str,
+                   beam_width: int = 5, num_candidates: int = 200) -> List[Tuple[str, float]]:
+        """
+        Generate rhyming text using Qwen3 base language model
+
+        Args:
+            text_with_placeholder: Text with placeholders (e.g., "恰似一江春水[M][M][M]").
+                The ``[M]`` markers are removed and generated characters are
+                appended to the end of what remains.
+            target_rhyme: Target rhyme pattern, either Chinese characters or
+                whitespace-separated TONE3 pinyin (e.g. "bai3 tan1").
+            beam_width: Number of beams to maintain during search
+            num_candidates: Number of top candidates to consider at each step
+
+        Returns:
+            List of (sequence, score) tuples, highest score first, where
+            ``score = exp(total_log_prob / SCORE_TEMPERATURE)``. If no
+            candidate fits some position, the search stops there and the
+            shorter sequences found so far are returned.
+        """
+        if is_pinyin(target_rhyme):
+            # split() rather than split(' '): repeated or surrounding spaces
+            # must not produce empty syllables (they crashed on syl[-1]).
+            target_rhyme_pinyin = target_rhyme.split()
+        else:
+            target_rhyme_pinyin = [
+                pinyin(rhyme, style=Style.TONE3, heteronym=False,
+                       neutral_tone_with_five=True)[0][0]
+                for rhyme in target_rhyme if not rhyme.isspace()
+            ]
+        # Count placeholders to replace
+        placeholder_count = text_with_placeholder.count('[M]')
+        if placeholder_count != len(target_rhyme_pinyin):
+            print(f"Warning: Number of placeholders ({placeholder_count}) doesn't match target rhyme length ({len(target_rhyme_pinyin)})")
+        # Initialize beam with the original sequence, placeholders removed
+        base_text = text_with_placeholder.replace('[M]', '')
+        if not base_text:
+            # Give the model some context when the prompt would be empty
+            base_text = "一个常见词汇是："
+        beam = [BeamEntry(sequence=base_text, log_prob=0.0, position=0)]
+        # One search step per target syllable
+        for i, syl in enumerate(tqdm(target_rhyme_pinyin, desc="Generating rhymes")):
+            syllable, tone = self._split_tone(syl)
+            target_yunmu = self.syllable_to_yunmu.get(syllable)
+            allowed_rhymes = set(self.rhymes.get(target_yunmu, {}).get(self.mode, []))
+            new_beam = []
+            for beam_entry in beam:
+                # Best effort: a failing hypothesis is reported and dropped
+                # instead of aborting the whole search.
+                try:
+                    top_tokens, top_log_probs = self._get_next_token_probabilities(
+                        beam_entry.sequence, num_candidates)
+                except Exception as e:
+                    print(f"Error getting probabilities: {e}")
+                    continue
+                matching_candidates = self._prefilter_tokens_by_rhyme(
+                    top_tokens, top_log_probs, allowed_rhymes, tone
+                )
+                new_beam.extend(
+                    BeamEntry(
+                        sequence=beam_entry.sequence + char,
+                        log_prob=beam_entry.log_prob + log_prob_value,
+                        position=i + 1
+                    )
+                    for char, log_prob_value, _token_id in matching_candidates
+                )
+            if not new_beam:
+                print(f"Warning: No valid candidates found for position {i} (syllable: {syl})")
+                break
+            # Keep only top beam_width candidates
+            new_beam.sort(key=lambda x: x.log_prob, reverse=True)
+            beam = new_beam[:beam_width]
+        # Return final results sorted by score
+        final_results = [(entry.sequence, np.exp(entry.log_prob / self.SCORE_TEMPERATURE))
+                         for entry in beam]
+        final_results.sort(key=lambda x: x[1], reverse=True)
+        return final_results
+# Example usage: complete a prompt with four characters rhyming with 摆摊算命.
+if __name__ == "__main__":
+    # Loads the model and rule files; needs the rules/ directory in the CWD.
+    rhymer = CiJiangRhymer(strict=False, tone=True)
+    # NOTE(review): the first character of this prompt was lost to an encoding
+    # error (it had become U+FFFD replacement characters); "有" is a stand-in
+    # guess - restore the intended character if it is known.
+    base_text = "有人给你[M][M][M][M]"
+    # target_rhyme = "摆摊算命"  # Target rhyme pattern
+    target_rhyme = "bai3 tan1 suan4 ming4"  # Pinyin representation for testing
+    results = rhymer.get_rhymes(base_text, target_rhyme, beam_width=10, num_candidates=5000)
+    print("Generated rhyming completions:")
+    # The printed value is get_rhymes' score, exp(log_prob / 10).
+    for rank, (sequence, prob) in enumerate(results, start=1):
+        print(f"{rank}. {sequence} (probability: {prob:.4f})")