shevadesuyash committed on
Commit
47d18ca
·
verified ·
1 Parent(s): a8382bd

Upload 6 files

Browse files
Files changed (6) hide show
  1. Dockerfile +31 -0
  2. app.py +92 -0
  3. cache_models.py +37 -0
  4. grammar_chatbot.py +159 -0
  5. paragraph_checker.py +62 -0
  6. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Base image
2
+ FROM python:3.10-slim
3
+
4
+ # Environment variables
5
+ ENV PYTHONDONTWRITEBYTECODE=1 \
6
+ PYTHONUNBUFFERED=1
7
+
8
+ # Install dependencies
9
+ RUN apt-get update && apt-get install -y \
10
+ openjdk-17-jre-headless \
11
+ git \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ # Set working directory
15
+ WORKDIR /app
16
+
17
+ # Copy all project files
18
+ COPY . .
19
+
20
+ # Install Python packages
21
+ RUN pip install --upgrade pip
22
+ RUN pip install --no-cache-dir -r requirements.txt
23
+
24
+ # Cache models during build
25
+ RUN python cache_models.py
26
+
27
+ # Expose port
28
+ EXPOSE 8080
29
+
30
+ # Run the Flask app
31
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify
2
+ from paragraph_checker import ParagraphCorrector
3
+ from grammar_chatbot import GrammarChatbot
4
+ import logging
5
+
6
+ app = Flask(__name__)
7
+
8
+ # Initialize services
9
+ paragraph_service = ParagraphCorrector()
10
+ chatbot_service = GrammarChatbot()
11
+
12
+ # Configure logging
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
+ @app.route('/correct_text', methods=['POST'])
17
+ def handle_paragraph():
18
+ """Endpoint for conservative paragraph correction"""
19
+ data = request.get_json()
20
+ text = data.get('paragraph', '').strip()
21
+
22
+ if not text:
23
+ return jsonify({"error": "No paragraph provided"}), 400
24
+
25
+ try:
26
+ corrected = paragraph_service.conservative_correction(text)
27
+ return jsonify({
28
+ "original_text": text,
29
+ "grammar_corrected": corrected
30
+ })
31
+ except Exception as e:
32
+ logger.error(f"Paragraph correction error: {str(e)}")
33
+ return jsonify({
34
+ "error": "Paragraph processing failed",
35
+ "details": str(e)
36
+ }), 500
37
+
38
+ @app.route('/chat', methods=['POST', 'GET']) # Added GET method for testing
39
+ def handle_chat():
40
+ """Endpoint for fluent conversational correction"""
41
+ if request.method == 'POST':
42
+ data = request.get_json()
43
+ user_input = data.get('message', '').strip()
44
+ else: # GET method for testing
45
+ user_input = request.args.get('message', '').strip()
46
+
47
+ if not user_input:
48
+ return jsonify({"error": "No message provided"}), 400
49
+
50
+ try:
51
+ response = chatbot_service.generate_response(user_input)
52
+ return jsonify({
53
+ "original_text": response["original_text"],
54
+ "corrected_text": response["corrected_text"],
55
+ "is_corrected": response["is_corrected"],
56
+ "compliment": response["compliment"],
57
+ "next_question": response["next_question"],
58
+ "end_conversation": response["end_conversation"]
59
+ })
60
+ except Exception as e:
61
+ logger.error(f"Chatbot error: {str(e)}")
62
+ return jsonify({
63
+ "error": "Chat processing failed",
64
+ "details": str(e)
65
+ }), 500
66
+
67
+ @app.route('/start', methods=['GET'])
68
+ def start_conversation():
69
+ try:
70
+ response = chatbot_service.start_conversation()
71
+ return jsonify(response)
72
+ except Exception as e:
73
+ logger.error(f"Start conversation error: {str(e)}")
74
+ return jsonify({
75
+ "error": "Failed to start conversation",
76
+ "details": str(e)
77
+ }), 500
78
+
79
+ @app.route('/health', methods=['GET', 'POST']) # Added POST method for testing
80
+ def health_check():
81
+ return jsonify({
82
+ "status": "healthy",
83
+ "services": ["paragraph", "chat"],
84
+ "details": {
85
+ "paragraph_service": "active",
86
+ "chatbot_service": "active"
87
+ }
88
+ })
89
+
90
+ if __name__ == '__main__':
91
+ logger.info("Starting grammar services...")
92
+ app.run(host='0.0.0.0', port=8080, debug=True)
cache_models.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import language_tool_python
2
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
3
+ import torch
4
+
5
+ def pre_cache_models():
6
+ """
7
+ Downloads and caches all required models and dependencies.
8
+ This script is run during the Docker build process.
9
+ """
10
+ print("Caching LanguageTool model...")
11
+ try:
12
+ # This will download and cache the LanguageTool server files
13
+ language_tool_python.LanguageTool('en-US')
14
+ print("LanguageTool model cached successfully.")
15
+ except Exception as e:
16
+ print(f"Failed to cache LanguageTool: {e}")
17
+
18
+ print("\nCaching Hugging Face models...")
19
+ models_to_cache = [
20
+ "vennify/t5-base-grammar-correction",
21
+ "humarin/chatgpt_paraphraser_on_T5_base"
22
+ ]
23
+
24
+ for model_name in models_to_cache:
25
+ try:
26
+ print(f"Caching {model_name}...")
27
+ # Cache both tokenizer and model files
28
+ AutoTokenizer.from_pretrained(model_name)
29
+ AutoModelForSeq2SeqLM.from_pretrained(model_name)
30
+ print(f"{model_name} cached successfully.")
31
+ except Exception as e:
32
+ print(f"Failed to cache {model_name}: {e}")
33
+
34
+ print("\nAll models have been cached.")
35
+
36
+ if __name__ == "__main__":
37
+ pre_cache_models()
grammar_chatbot.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
2
+ import torch
3
+ import random
4
+ from typing import Dict, List
5
+
6
+ class GrammarChatbot:
7
+ def __init__(self):
8
+ """Initialize models for fluent corrections"""
9
+ # Initialize models
10
+ self.grammar_model = pipeline(
11
+ "text2text-generation",
12
+ model="vennify/t5-base-grammar-correction",
13
+ device=0 if torch.cuda.is_available() else -1
14
+ )
15
+
16
+ # Fluent paraphrasing model
17
+ self.paraphrase_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
18
+ self.paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
19
+
20
+ # Enhanced conversation components
21
+ self.compliments = [
22
+ "Great job! Your English is improving!",
23
+ "Nice improvement! Keep it up!",
24
+ "You're doing well with your practice!",
25
+ "Good effort! I can see you're trying hard!",
26
+ "Excellent try! You're getting better!",
27
+ "Well done! That was much better!",
28
+ "Impressive! Your sentence structure is improving!"
29
+ ]
30
+
31
+ # Organized question bank by categories
32
+ self.question_categories = {
33
+ "daily_life": [
34
+ "What did you do this morning?",
35
+ "How do you usually spend your evenings?",
36
+ "What's your morning routine like?",
37
+ "Do you have any plans for this weekend?",
38
+ "What time do you usually wake up?"
39
+ ],
40
+ "hobbies": [
41
+ "What hobbies do you enjoy in your free time?",
42
+ "Have you picked up any new hobbies recently?",
43
+ "Do you prefer indoor or outdoor activities?",
44
+ "What's something you've always wanted to try?",
45
+ "Do you play any musical instruments?"
46
+ ],
47
+ "food": [
48
+ "What's your favorite comfort food?",
49
+ "Do you enjoy cooking? What's your specialty?",
50
+ "What's the most unusual food you've ever tried?",
51
+ "Do you prefer sweet or savory snacks?",
52
+ "What's your go-to breakfast?"
53
+ ],
54
+ "travel": [
55
+ "If you could visit any country, where would you go?",
56
+ "What's the most beautiful place you've ever seen?",
57
+ "Do you prefer beach vacations or city trips?",
58
+ "What's your favorite travel memory?",
59
+ "What's the next place you'd like to visit?"
60
+ ],
61
+ "technology": [
62
+ "How do you use technology in your daily life?",
63
+ "What's your opinion about social media?",
64
+ "Do you think AI will change our lives significantly?",
65
+ "What tech gadget couldn't you live without?",
66
+ "How has technology changed your work/studies?"
67
+ ],
68
+ "future": [
69
+ "Where do you see yourself in five years?",
70
+ "What's one skill you'd like to learn?",
71
+ "Do you have any big goals for this year?",
72
+ "What would your perfect day look like?",
73
+ "What's something you want to achieve?"
74
+ ]
75
+ }
76
+
77
+ self.current_question = None
78
+ self.current_category = None
79
+ self.conversation_history = []
80
+
81
+ def correct_grammar(self, text: str) -> str:
82
+ """Basic grammar correction"""
83
+ result = self.grammar_model(
84
+ text,
85
+ max_length=256,
86
+ num_beams=4,
87
+ early_stopping=True
88
+ )
89
+ return result[0]['generated_text']
90
+
91
+ def fluent_correction(self, text: str) -> str:
92
+ """Create fluent, natural rewrites"""
93
+ input_ids = self.paraphrase_tokenizer(
94
+ f"paraphrase: {text}",
95
+ return_tensors="pt",
96
+ max_length=256,
97
+ truncation=True
98
+ ).input_ids
99
+
100
+ outputs = self.paraphrase_model.generate(
101
+ input_ids,
102
+ temperature=0.7,
103
+ max_length=256,
104
+ num_beams=5,
105
+ early_stopping=True
106
+ )
107
+
108
+ return self.paraphrase_tokenizer.decode(outputs[0], skip_special_tokens=True)
109
+
110
+ def _get_next_question(self) -> str:
111
+ """Select next question based on conversation context"""
112
+ # If we're continuing a category, use follow-up questions
113
+ if self.current_category and random.random() < 0.6: # 60% chance to stay on topic
114
+ return random.choice(self.question_categories[self.current_category])
115
+
116
+ # Otherwise select a new random category
117
+ self.current_category = random.choice(list(self.question_categories.keys()))
118
+ return random.choice(self.question_categories[self.current_category])
119
+
120
+ def generate_response(self, user_input: str) -> Dict:
121
+ """Generate a conversational response"""
122
+ # Store conversation history
123
+ self.conversation_history.append(("user", user_input))
124
+
125
+ # Correct the input
126
+ corrected = self.fluent_correction(user_input)
127
+ is_corrected = corrected.lower() != user_input.lower()
128
+
129
+ # Generate response
130
+ response = {
131
+ "original_text": user_input,
132
+ "corrected_text": corrected,
133
+ "is_corrected": is_corrected,
134
+ "compliment": random.choice(self.compliments) if is_corrected else "",
135
+ "next_question": self._get_next_question(),
136
+ "end_conversation": False
137
+ }
138
+
139
+ # Update state
140
+ self.current_question = response["next_question"]
141
+ self.conversation_history.append(("bot", response["next_question"]))
142
+
143
+ return response
144
+
145
+ def start_conversation(self) -> Dict:
146
+ """Initialize a new conversation"""
147
+ self.conversation_history = []
148
+ self.current_category = random.choice(list(self.question_categories.keys()))
149
+ self.current_question = random.choice(self.question_categories[self.current_category])
150
+
151
+ return {
152
+ "message": "Hello! I'm your English practice partner. Let's chat!",
153
+ "next_question": self.current_question,
154
+ "end_conversation": False
155
+ }
156
+
157
+ def get_conversation_history(self) -> List[tuple]:
158
+ """Get the complete conversation history"""
159
+ return self.conversation_history
paragraph_checker.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import language_tool_python
2
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
3
+ import torch
4
+
5
+ class ParagraphCorrector:
6
+ def __init__(self):
7
+ """Initialize correction models with conservative settings"""
8
+ # Grammar tool with increased timeout
9
+ self.grammar_tool = language_tool_python.LanguageTool(
10
+ 'en-US',
11
+ config={'maxTextLength': 100000}
12
+ )
13
+
14
+ # Conservative grammar correction model
15
+ self.grammar_model = pipeline(
16
+ "text2text-generation",
17
+ model="vennify/t5-base-grammar-correction",
18
+ device=0 if torch.cuda.is_available() else -1
19
+ )
20
+
21
+ def correct_sentence(self, sentence: str) -> str:
22
+ """Correct a single sentence conservatively"""
23
+ # Basic grammar/spelling correction
24
+ matches = self.grammar_tool.check(sentence)
25
+ corrected = language_tool_python.utils.correct(sentence, matches)
26
+
27
+ # Light neural correction
28
+ result = self.grammar_model(
29
+ corrected,
30
+ max_length=256,
31
+ num_beams=3,
32
+ temperature=0.3, # Low temperature for minimal changes
33
+ early_stopping=True
34
+ )
35
+ return result[0]['generated_text']
36
+
37
+ def conservative_correction(self, text: str) -> str:
38
+ """Process text while preserving original structure"""
39
+ if not text.strip():
40
+ return text
41
+
42
+ # Split into sentences while preserving delimiters
43
+ sentences = []
44
+ current = ""
45
+ for char in text:
46
+ current += char
47
+ if char in {'.', '!', '?'}:
48
+ sentences.append(current)
49
+ current = ""
50
+ if current:
51
+ sentences.append(current)
52
+
53
+ # Correct each sentence individually
54
+ corrected_sentences = []
55
+ for sentence in sentences:
56
+ if sentence.strip():
57
+ corrected = self.correct_sentence(sentence)
58
+ corrected_sentences.append(corrected)
59
+ else:
60
+ corrected_sentences.append(sentence)
61
+
62
+ return ''.join(corrected_sentences)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Flask==3.0.3
2
+ language-tool-python==2.8
3
+ transformers==4.49.0
4
+ torch==2.6.0