fisherman611 committed on
Commit
53d8833
·
verified ·
1 Parent(s): ce45460

Create set_up.py

Browse files
Files changed (1) hide show
  1. set_up.py +245 -0
set_up.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Vietnamese Legal Chatbot - Setup Script
4
+ This script initializes the RAG system and processes the legal documents.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ from typing import Dict, Any
10
+ from utils.data_loader import LegalDataLoader
11
+ from main.chatbot import VietnameseLegalRAG
12
+ from config import Config
13
+
14
def check_environment():
    """Verify that the required configuration values are set.

    Reads ``GOOGLE_API_KEY``, ``QDRANT_URL`` and ``QDRANT_API_KEY`` from
    ``Config`` (in that order) and prints guidance for every one that is
    falsy.

    Returns:
        bool: ``True`` when all three settings are truthy, else ``False``.
    """
    print("Checking environment configuration...")

    required_names = ("GOOGLE_API_KEY", "QDRANT_URL", "QDRANT_API_KEY")
    unset_names = [name for name in required_names if not getattr(Config, name)]

    # Happy path first: nothing missing means nothing more to report.
    if not unset_names:
        print("✅ Environment configuration OK")
        return True

    print(f"❌ Missing environment variables: {', '.join(unset_names)}")
    print("\nPlease set the following environment variables:")
    for name in unset_names:
        print(f" export {name}=your_value_here")
    print("\nOr create a .env file with these variables.")
    return False
39
+
40
def check_data_files():
    """Verify that the corpus and stopword files exist on disk.

    Checks ``Config.CORPUS_PATH`` and ``Config.STOPWORDS_PATH`` with
    ``os.path.exists`` and prints the paths that are absent.

    Returns:
        bool: ``True`` when both paths exist, else ``False``.
    """
    print("Checking data files...")

    absent_paths = [
        candidate
        for candidate in (Config.CORPUS_PATH, Config.STOPWORDS_PATH)
        if not os.path.exists(candidate)
    ]

    if not absent_paths:
        print("✅ Data files OK")
        return True

    print(f"❌ Missing data files: {', '.join(absent_paths)}")
    return False
60
+
61
def setup_rag_system(force_rebuild: bool = False):
    """Load the legal corpus, smoke-check each component, and build indices.

    Steps, in order: create a ``LegalDataLoader``, load the corpus, prepare
    the documents for indexing, construct each retrieval component once to
    surface per-component errors, construct ``VietnameseLegalRAG`` and call
    its ``setup_indices``.

    Args:
        force_rebuild: Forwarded unchanged to ``setup_indices`` as
            ``force_rebuild``.

    Returns:
        The initialized ``VietnameseLegalRAG`` instance, or ``None`` when no
        documents were loaded/prepared or any step raised. Errors are printed
        (with a traceback for non-encoding errors) instead of propagating.
    """
    print("Setting up RAG system...")

    try:
        # Initialize data loader
        print("Initializing data loader...")
        data_loader = LegalDataLoader()

        # Load legal documents
        print("Loading legal corpus...")
        legal_docs = data_loader.load_legal_corpus()

        if not legal_docs:
            print("❌ No legal documents loaded")
            return None

        # Prepare documents for indexing
        print("Preparing documents for indexing...")
        documents = data_loader.prepare_documents_for_indexing()

        if not documents:
            print("❌ No documents prepared for indexing")
            return None

        print(f"📚 Prepared {len(documents)} documents for indexing")

        # Initialize RAG system with component-by-component error handling.
        # The imports live inside each ``try`` so an import failure is
        # attributed to the component that needs it. The three standalone
        # instances are only constructed to verify they can be built; they
        # are not used afterwards, so they are not bound to names.
        print("Initializing RAG system components...")

        try:
            print(" - Initializing text processor...")
            from utils.text_processor import VietnameseTextProcessor
            VietnameseTextProcessor()
            print(" ✅ Text processor initialized")
        except Exception as e:
            print(f" ❌ Text processor error: {e}")
            raise

        try:
            print(" - Initializing vector store...")
            from main.vector_store import QdrantVectorStore
            QdrantVectorStore()
            print(" ✅ Vector store initialized")
        except Exception as e:
            print(f" ❌ Vector store error: {e}")
            raise

        try:
            print(" - Initializing BM25 retriever...")
            from main.bm25_retriever import BM25Retriever
            BM25Retriever()
            print(" ✅ BM25 retriever initialized")
        except Exception as e:
            print(f" ❌ BM25 retriever error: {e}")
            raise

        try:
            print(" - Initializing complete RAG system...")
            rag_system = VietnameseLegalRAG()
            print(" ✅ RAG system initialized")
        except Exception as e:
            print(f" ❌ RAG system initialization error: {e}")
            raise

        # Setup indices
        print("Building indices (this may take a while)...")
        rag_system.setup_indices(documents, force_rebuild=force_rebuild)

        print("✅ RAG system setup completed")
        return rag_system

    except UnicodeDecodeError as e:
        print(f"❌ Encoding error setting up RAG system: {e}")
        print("💡 Try running: python cleanup.py")
        # Derive the name from this file so the hint stays correct; the
        # previous hard-coded "setup_system.py" did not match this script.
        print(f"💡 Then run setup again: python {os.path.basename(__file__)}")
        return None
    except Exception as e:
        print(f"❌ Error setting up RAG system: {e}")
        print("💡 For encoding issues, try: python cleanup.py")
        import traceback
        print("Full error traceback:")
        traceback.print_exc()
        return None
145
+
146
def test_system(rag_system):
    """Run three sample Vietnamese legal questions through the RAG system.

    For each question, calls ``rag_system.answer_question(question,
    use_fallback=False)`` and prints the first 200 characters of the answer,
    the number of retrieved documents and whether the fallback was used.
    An exception raised for one question is printed and does not stop the
    remaining questions.

    Args:
        rag_system: Object exposing ``answer_question(question,
            use_fallback=...)`` that returns a mapping with the keys
            ``answer``, ``retrieved_documents`` and ``fallback_used``.
    """
    print("\nTesting RAG system...")

    test_questions = [
        # "người" was previously stored with mis-encoded bytes (U+FFFD),
        # which sent a corrupted query to the retriever.
        "Quyền và nghĩa vụ của người lao động là gì?",
        "Thời gian làm việc theo quy định của pháp luật?",
        "Điều kiện kết hôn theo luật hôn nhân và gia đình?",
    ]

    for i, question in enumerate(test_questions, 1):
        print(f"\n--- Test {i}: {question} ---")

        try:
            result = rag_system.answer_question(question, use_fallback=False)

            print(f"Answer: {result['answer'][:200]}...")
            print(f"Retrieved docs: {len(result['retrieved_documents'])}")
            print(f"Fallback used: {result['fallback_used']}")

        except Exception as e:
            # Best-effort smoke test: report and continue with the next one.
            print(f"Error answering question: {e}")


# Despite its ``test_`` prefix this is a manual smoke check, not a pytest
# test. Opt out of collection so importing it into a test module does not
# fail on a missing ``rag_system`` fixture.
test_system.__test__ = False
168
+
169
def display_system_status(rag_system):
    """Print a status report for the RAG system.

    Calls ``rag_system.get_system_status()`` and prints one availability
    line for each of the keys ``llm_available``, ``vector_store_available``,
    ``bm25_available``, ``google_api_configured`` and ``qdrant_configured``
    (all required), followed by the optional ``vector_store_info`` and
    ``bm25_stats`` sections when they are present and non-empty.

    Numeric fields that are missing or ``None`` are shown as ``0``. The
    original guarded only ``vectors_count``; a ``None`` in any other count
    raised ``TypeError`` during number formatting.

    Args:
        rag_system: Object exposing ``get_system_status()`` that returns a
            dict with the keys described above.
    """
    print("\n" + "="*50)
    print("SYSTEM STATUS")
    print("="*50)

    status = rag_system.get_system_status()

    print(f"🤖 LLM Available: {'✅' if status['llm_available'] else '❌'}")
    print(f"🔍 Vector Store: {'✅' if status['vector_store_available'] else '❌'}")
    print(f"📊 BM25 Retriever: {'✅' if status['bm25_available'] else '❌'}")
    print(f"🔑 Google API: {'✅' if status['google_api_configured'] else '❌'}")
    print(f"☁️ QDrant Cloud: {'✅' if status['qdrant_configured'] else '❌'}")

    # Vector store info
    info = status.get('vector_store_info')
    if info:
        # ``or 0`` covers both a missing key and a key present with None.
        points_count = info.get('points_count') or 0
        vectors_count = info.get('vectors_count') or 0
        print("\n📚 Vector Store Info:")
        print(f" - Collection: {info.get('name', 'N/A')}")
        print(f" - Documents: {points_count:,}")
        print(f" - Vectors: {vectors_count:,}")

    # BM25 stats
    stats = status.get('bm25_stats')
    if stats:
        total_documents = stats.get('total_documents') or 0
        vocabulary_size = stats.get('vocabulary_size') or 0
        average_length = stats.get('average_document_length') or 0
        print("\n📊 BM25 Index Stats:")
        print(f" - Documents: {total_documents:,}")
        print(f" - Vocabulary: {vocabulary_size:,}")
        print(f" - Avg Doc Length: {average_length:.1f}")
199
+
200
def main():
    """Run the full setup workflow from the command line.

    Order of operations: check the environment configuration, check the data
    files, read the flags from ``sys.argv``, build the RAG system, print its
    status, optionally run the sample questions, then print next steps and
    usage.

    Recognized flags:
        ``--rebuild`` / ``-r``: pass ``force_rebuild=True`` to
            ``setup_rag_system``.
        ``--test`` / ``-t``: run ``test_system`` after setup.

    Exits with status 1 (``sys.exit(1)``) when a prerequisite check fails or
    ``setup_rag_system`` returns ``None``.
    """
    print("🏛️ Vietnamese Legal Chatbot - Setup")
    print("="*50)

    # Check prerequisites
    if not check_environment():
        print("\n❌ Environment check failed. Please configure your environment variables.")
        sys.exit(1)

    if not check_data_files():
        print("\n❌ Data file check failed. Please ensure all data files are present.")
        sys.exit(1)

    # Parse command line arguments
    force_rebuild = "--rebuild" in sys.argv or "-r" in sys.argv
    run_tests = "--test" in sys.argv or "-t" in sys.argv

    if force_rebuild:
        print("\n🔄 Force rebuild mode enabled")

    # Setup RAG system
    rag_system = setup_rag_system(force_rebuild=force_rebuild)

    # setup_rag_system signals failure with None; test for exactly that so a
    # valid but falsy system object is not mistaken for a failure.
    if rag_system is None:
        print("\n❌ RAG system setup failed")
        sys.exit(1)

    # Display system status
    display_system_status(rag_system)

    # Run tests if requested
    if run_tests:
        test_system(rag_system)

    print("\n✅ Setup completed successfully!")
    print("\nYou can now run the Streamlit app:")
    print(" streamlit run app.py")

    # Show this file's real name; the previous hard-coded "setup_system.py"
    # did not match the script and produced commands that could not be run.
    script_name = os.path.basename(__file__)
    print("\nUsage:")
    print(f" python {script_name} # Normal setup")
    print(f" python {script_name} --rebuild # Force rebuild indices")
    print(f" python {script_name} --test # Run with tests")
244
# Run the setup workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()