import math
import re
from collections import Counter

import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
)


class AITextDetector:
    """
    AI Text Detector
    - Transformer classifier for AI vs Human
    - Metrics: perplexity, burstiness, repetition, semantic smoothness
    - Returns AI-vs-Human probability + category distribution
    """

    def __init__(self, model_name="roberta-base-openai-detector", device=None):
        # Device setup
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        # Classifier model & tokenizer
        self.classifier_tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(self.device)
        self.model.eval()

        # Language model for perplexity (GPT-2; a smaller LM can be swapped in if needed)
        self.lm_tokenizer = AutoTokenizer.from_pretrained("gpt2")
        self.lm_model = AutoModelForCausalLM.from_pretrained("gpt2").to(self.device)
        self.lm_model.eval()

    # ------------------ Metrics ------------------

    def _compute_perplexity(self, text: str, max_length: int = 512):
        """Compute perplexity using the GPT-2 LM."""
        encodings = self.lm_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
        ).to(self.device)
        with torch.no_grad():
            outputs = self.lm_model(**encodings, labels=encodings.input_ids)
            loss = outputs.loss.item()
        # Clamp to avoid overflow
        return float(min(math.exp(loss), 1e4))

    def _compute_burstiness(self, text: str):
        """Variance of sentence lengths (burstiness)."""
        sentences = [s.strip() for s in re.split(r"[.!?]", text) if s.strip()]
        if len(sentences) < 2:
            return 0.0
        lengths = [len(s.split()) for s in sentences]
        return float(np.var(lengths))

    def _compute_repetition_score(self, text: str):
        """Repetition = proportion of duplicate words."""
        words = [w.lower() for w in re.findall(r"\b\w+\b", text)]
        if not words:
            return 0.0
        counts = Counter(words)
        repeated = sum(c - 1 for c in counts.values() if c > 1)
        return repeated / len(words)

    def _compute_semantic_smoothness(self, text: str):
        """
        Semantic smoothness = average cosine similarity between consecutive
        sentence embeddings. Uses last hidden states instead of raw embeddings.
""" sentences = [s.strip() for s in re.split(r"[.!?]", text) if s.strip()] if len(sentences) < 2: return 1.0 embeddings = [] for s in sentences: encodings = self.classifier_tokenizer( s, return_tensors="pt", truncation=True, padding=True, max_length=128, ).to(self.device) with torch.no_grad(): outputs = self.model( **encodings, output_hidden_states=True, ) hidden_states = outputs.hidden_states[-1] # last layer sent_emb = hidden_states.mean(dim=1).cpu().numpy() embeddings.append(sent_emb) similarities = [] for i in range(len(embeddings) - 1): a, b = embeddings[i], embeddings[i + 1] num = float(np.dot(a, b.T)) denom = np.linalg.norm(a) * np.linalg.norm(b) if denom > 0: similarities.append(num / denom) return float(np.mean(similarities)) if similarities else 1.0 # ------------------ Main detection ------------------ def detect(self, text: str): """Run detection pipeline and return results.""" # Empty text case if not text.strip(): return { "ai_probability": 0.0, "metrics": {}, "distribution": {}, "final_label": "empty", } # Classifier prediction inputs = self.classifier_tokenizer( text, return_tensors="pt", truncation=True, padding=True, max_length=512, ).to(self.device) with torch.no_grad(): logits = self.model(**inputs).logits probs = torch.softmax(logits, dim=1).cpu().numpy()[0] human_prob, ai_prob = float(probs[0]), float(probs[1]) # Extra metrics perplexity = self._compute_perplexity(text) burstiness = self._compute_burstiness(text) repetition = self._compute_repetition_score(text) smoothness = self._compute_semantic_smoothness(text) # Normalize distribution distribution = { "Human-written": round(human_prob * 100, 2), "AI-generated": round(ai_prob * 100 * (1 - repetition), 2), "AI-generated & AI-refined": round(ai_prob * 100 * repetition, 2), "Mixed": round(ai_prob * 100 * (1 - smoothness), 2), } total = sum(distribution.values()) if total > 0: for k in distribution: distribution[k] = round(distribution[k] / total * 100, 2) # Final label final_label = max(distribution, key=distribution.get) return { "summary": f"{distribution['AI-generated']}% of text is likely AI", "overall_ai_probability": overall_ai_probability, "category_distribution": distribution, "metrics": { "perplexity": round(perplexity, 2), "burstiness": round(burstiness, 3), "repetition_score": round(repetition, 3), "semantic_smoothness": round(smoothness, 3), "ai_probability": overall_ai_probability, }, "interpretation": ( "This detector uses structural patterns (perplexity, burstiness, repetition, semantic smoothness) " "to estimate the likelihood of AI authorship. Results are probabilistic, not definitive. " "Always apply judgment." ), "label": "AI-generated" if overall_ai_probability > 0.5 else "Human-written" }