Spaces:

JoeArmani
/

csc525_retrieval_based_chatbot

Sleeping

App Files Files Community

JoeArmani committed on Jan 11

Commit

e5be70f

1 Parent(s): 7a0020b

reranker scoring

Browse files

Files changed (5) hide show

chatbot_model.py +92 -143
chatbot_validator.py +47 -40
new_iteration/run_taskmaster_processor.py +1 -1
new_iteration/taskmaster_processor.py +112 -56
validate_model.py +0 -4

chatbot_model.py CHANGED Viewed

@@ -10,7 +10,6 @@ from pathlib import Path
 import datetime
 import faiss
 import gc
 import re
 from tf_data_pipeline import TFDataPipeline
 from response_quality_checker import ResponseQualityChecker
@@ -280,15 +279,6 @@ class RetrievalChatbot(DeviceAwareModel):
         dummy_input = tf.zeros((1, config.max_context_token_limit), dtype=tf.int32)
         _ = chatbot.encoder(dummy_input, training=False)
-        # # Then load your custom weights
-        # custom_weights_path = load_dir / "encoder_custom_weights.weights.h5"
-        # if custom_weights_path.exists():
-        #     logger.info(f"Loading custom top-level weights from {custom_weights_path}")
-        #     chatbot.encoder.load_weights(str(custom_weights_path))
-        #     logger.info("Custom top-level weights loaded successfully.")
-        # else:
-        #     logger.warning(f"Custom weights file not found at {custom_weights_path}.")
         # 4) Load tokenizer
         chatbot.tokenizer = AutoTokenizer.from_pretrained(load_dir / "tokenizer")
@@ -361,7 +351,10 @@ class RetrievalChatbot(DeviceAwareModel):
         self.tokenizer.save_pretrained(save_dir / "tokenizer")
         logger.info(f"Models and tokenizer saved to {save_dir}.")
     def retrieve_responses_cross_encoder(
         self,
         query: str,
@@ -391,31 +384,34 @@ class RetrievalChatbot(DeviceAwareModel):
             logger.info(f"Summarized Query: {query}")
         detected_domain = self.detect_domain_from_query(query)
-        logger.debug(f"Detected domain '{detected_domain}' for query: {query}")
-        # 2) Retrieve more initial candidates from FAISS
         initial_k = min(top_k * 10, len(self.data_pipeline.response_pool))
-        dense_candidates = self.retrieve_responses_faiss(query, domain=detected_domain, top_k=initial_k)
-        boosted_candidates = dense_candidates
-        # 4) If we have a cross-encoder, re-rank these boosted candidates
         if not reranker:
-            logger.warning("No CrossEncoderReranker provided; creating a default one.")
             reranker = CrossEncoderReranker(model_name="cross-encoder/ms-marco-MiniLM-L-12-v2")
-        texts = [item[0] for item in boosted_candidates]
         ce_scores = reranker.rerank(query, texts, max_length=256)
         # Combine cross-encoder score with the base FAISS score (weighted sum)
         final_candidates = []
-        for (resp_text, faiss_score), ce_score in zip(boosted_candidates, ce_scores):
             # TODO: dial this in.
-            alpha = 0.8
-            combined_score = alpha * ce_score + (1 - alpha) * faiss_score
             length_adjusted_score = self.length_adjust_score(resp_text, combined_score)
             #combined_score = ce_score * faiss_score
-            final_candidates.append((resp_text, combined_score))
         # Sort descending by combined score
         final_candidates.sort(key=lambda x: x[1], reverse=True)
@@ -434,29 +430,34 @@ class RetrievalChatbot(DeviceAwareModel):
     def extract_keywords(self, query: str) -> List[str]:
         """
-        Extract keywords from the query based on DOMAIN_KEYWORDS.
         """
         query_lower = query.lower()
-        keywords = set()
-        for domain, kws in self.DOMAIN_KEYWORDS.items():
-            for kw in kws:
                 if kw in query_lower:
-                    keywords.add(kw)
-        return list(keywords)
-    def length_adjust_score(resp_text: str, base_score: float) -> float:
-        # Apply a short penalty
-        words = len(resp_text.split())
-        if words < 3:
-            # big penalty or skip entirely
-            return base_score * 0.1  # or base_score - 0.01
-        # Add a mild bonus for lines that exceed 12 words:
-        if words > 12:
-            # e.g. +0.002 * (words - 12)
-            bonus = 0.002 * (words - 12)
             base_score += bonus
         return base_score
     def detect_domain_from_query(self, query: str) -> str:
@@ -464,12 +465,12 @@ class RetrievalChatbot(DeviceAwareModel):
         Detect the domain of the query based on keywords.
         """
         domain_patterns = {
-            'restaurant': r'\b(restaurant|dining|food|dine|reservation|table|menu|cuisine|eat|place\s?to\s?eat|hungry|chef|dish|meal|fork|knife|spoon|brunch|bistro|buffet|catering|gourmet|fast\s?food|fine\s?dining|takeaway|delivery|restaurant\s?booking)\b',
-            'movie': r'\b(movie|cinema|film|ticket|showtime|showing|theater|flick|screening|film\s?ticket|film\s?show|blockbuster|premiere|trailer|director|actor|actress|plot|genre|screen|sequel|animation|documentary)\b',
-            'ride_share': r'\b(ride|taxi|uber|lyft|car\s?service|pickup|dropoff|driver|cab|hailing|rideshare|ride\s?hailing|carpool|chauffeur|transit|transportation|hail\s?ride)\b',
-            'coffee': r'\b(coffee|café|cafe|starbucks|espresso|latte|mocha|americano|barista|brew|cappuccino|macchiato|iced\s?coffee|cold\s?brew|espresso\s?machine|coffee\s?shop|tea|chai|java|bean|roast|decaf)\b',
-            'pizza': r'\b(pizza|delivery|order\s?food|pepperoni|topping|pizzeria|slice|pie|margherita|deep\s?dish|thin\s?crust|cheese|oven|tossed|sauce|garlic\s?bread|calzone)\b',
-            'auto': r'\b(car|vehicle|repair|maintenance|mechanic|oil\s?change|garage|auto\s?shop|tire|check\s?engine|battery|transmission|brake|engine\s?diagnostics|carwash|detail|alignment|exhaust|spark\s?plug|dashboard)\b',
         }
         # Check for matches
@@ -479,20 +480,20 @@ class RetrievalChatbot(DeviceAwareModel):
         return 'other'
-    def is_numeric_response(text: str) -> bool:
         """
-        Return True if `text` is purely digits (and/or spaces).
-        e.g.: "4 3 13" -> True, " 42 " -> True, "hello 42" -> False
         """
-        pattern = r'^\s*[0-9]+(\s+[0-9]+)*\s*$'
-        return bool(re.match(pattern, text))
     def retrieve_responses_faiss(
         self,
         query: str,
         domain: str = 'other',
         top_k: int = 5,
-        boost_factor: float = 1.3
     ) -> List[Tuple[str, float]]:
         """
         Retrieve top-k responses from the FAISS index (IndexFlatIP) given a user query.
@@ -511,117 +512,65 @@ class RetrievalChatbot(DeviceAwareModel):
         q_emb_np = q_emb.reshape(1, -1).astype('float32')
         # Search the index
-        distances, indices = self.data_pipeline.index.search(q_emb_np, top_k * 10) # Adjust multiplier as needed
         # IndexFlatIP: 'distances' are inner products (cosine similarities for normalized vectors)
         candidates = []
         for rank, idx in enumerate(indices[0]):
-            if idx == -1:
-                continue  # FAISS may return -1 for invalid indices
             response = self.data_pipeline.response_pool[idx]
-            text = response.get('text', '')
             cand_domain = response.get('domain', 'other')
             score = distances[0][rank]
-            # Filter out numeric responses and very short texts
-            if not self.is_numeric_response(text) and len(text.split()) >= self.config.min_text_length:
-                candidates.append((text, cand_domain, score))
         if not candidates:
-            logger.warning("No valid candidates found after initial filtering.")
             return []
         # Sort candidates by score descending
         candidates.sort(key=lambda x: x[2], reverse=True)
         # Filter in-domain responses
-        if domain != 'other':
-            in_domain_responses = [c for c in candidates if c[1] == domain]
-            if not in_domain_responses:
-                logger.info(f"No in-domain responses found for domain '{domain}'. Falling back to all candidates.")
-                in_domain_responses = candidates
-        else:
-            in_domain_responses = candidates
         # Boost responses containing query keywords
         query_keywords = self.extract_keywords(query)
-        boosted_responses = []
-        for resp_text, resp_domain, score in in_domain_responses:
-            if any(kw in resp_text.lower() for kw in query_keywords):
-                boosted_score = score * boost_factor
-                logger.debug(f"Boosting response: '{resp_text}' by factor {boost_factor}")
-            else:
-                boosted_score = score
-            boosted_responses.append((resp_text, boosted_score))
         # Sort boosted responses
-        boosted_responses.sort(key=lambda x: x[1], reverse=True)
-        # Select top_k responses
-        top_responses = boosted_responses[:top_k]
-        logger.debug(f"Top {top_k} responses selected.")
-        return top_responses
-    # def retrieve_responses_faiss(
-    #     self,
-    #     query: str,
-    #     domain: str = 'other',
-    #     top_k: int = 5,
-    #     boost_factor: float = 1.3
-    # ) -> List[Tuple[str, float]]:
-    #     """
-    #     Retrieve top-k responses from the FAISS index (IndexFlatIP) given a user query.
-    #     Args:
-    #         query: The user input text
-    #         top_k: Number of top results to return
-    #     Returns:
-    #         List of (response_text, similarity) sorted by descending similarity
-    #     """
-    #     # Encode the query
-    #     q_emb = self.data_pipeline.encode_query(query)
-    #     q_emb_np = q_emb.reshape(1, -1).astype('float32')
-    #     # Search the index
-    #     distances, indices = self.data_pipeline.index.search(q_emb_np, top_k * 10) # distances: shape [1, k], indices: shape [1, k]
-    #     # IndexFlatIP: 'distances' are cosine similarities in [-1, +1].
-    #     candidates = []
-    #     for rank, idx in enumerate(indices[0]):
-    #         text = self.response_pool[idx]['text']
-    #         cand_domain = self.response_pool[idx]['domain']
-    #         score = distances[0][rank]
-    #         # filter out responses with only numbers or too few words
-    #         word_count = len(text.split())
-    #         if not self.is_numeric_resonse(text) and word_count >= 2:
-    #             candidates.append((text, cand_domain, score))
-    #     # Filter to in-domain responses
-    #     candidates.sort(key=lambda x: x[2], reverse=True)
-    #     in_domain_responses = [(text, score) for (text, cand_domain, score) in candidates if cand_domain == domain]
-    #     # Boost keyword matching responses
-    #     query_keywords = self.extract_keywords(query)
-    #     boosted_responses = []
-    #     for (resp_text, domain, score) in in_domain_responses:
-    #         # Check if any keyword is present in the response text
-    #         for kw in query_keywords:
-    #             if kw in resp_text.lower():
-    #                 boosted_score = score * boost_factor
-    #                 print(f"Boosting response: '{resp_text}' by factor {boost_factor}")
-    #                 break
-    #         else:
-    #             boosted_score = score
-    #         boosted_responses.append((resp_text, domain, boosted_score))
-    #     # Debug
-    #     logger.debug("\nFAISS Search Results (top 15 for debug):")
-    #     for i, (resp, score) in enumerate(boosted_responses[:15], start=1):
-    #         logger.debug(f"{i}. Score: {score:.4f} -> {resp[:60]}")
-    #     return boosted_responses[:top_k]
     def chat(
         self,

 import datetime
 import faiss
 import gc
 import re
 from tf_data_pipeline import TFDataPipeline
 from response_quality_checker import ResponseQualityChecker
         dummy_input = tf.zeros((1, config.max_context_token_limit), dtype=tf.int32)
         _ = chatbot.encoder(dummy_input, training=False)
         # 4) Load tokenizer
         chatbot.tokenizer = AutoTokenizer.from_pretrained(load_dir / "tokenizer")
         self.tokenizer.save_pretrained(save_dir / "tokenizer")
         logger.info(f"Models and tokenizer saved to {save_dir}.")
+    def sigmoid(self, x: float) -> float:
+        return 1 / (1 + np.exp(-x))
     def retrieve_responses_cross_encoder(
         self,
         query: str,
             logger.info(f"Summarized Query: {query}")
         detected_domain = self.detect_domain_from_query(query)
+        #logger.debug(f"Detected domain '{detected_domain}' for query: {query}")
+        # Retrieve initial candidates from FAISS
         initial_k = min(top_k * 10, len(self.data_pipeline.response_pool))
+        faiss_candidates = self.retrieve_responses_faiss(query, domain=detected_domain, top_k=initial_k)
+        texts = [item[0] for item in faiss_candidates]
+        # Re-rank these boosted candidates
         if not reranker:
             reranker = CrossEncoderReranker(model_name="cross-encoder/ms-marco-MiniLM-L-12-v2")
         ce_scores = reranker.rerank(query, texts, max_length=256)
         # Combine cross-encoder score with the base FAISS score (weighted sum)
         final_candidates = []
+        for (resp_text, faiss_score), ce_score in zip(faiss_candidates, ce_scores):
             # TODO: dial this in.
+            ce_prob = self.sigmoid(ce_score)  # ~ relevance in [0..1]
+            faiss_norm = (faiss_score + 1)/2.0
+            combined_score = 0.9 * ce_prob + 0.1 * faiss_norm
+            # alpha = 0.9
+            # print(f'CE SCORE: {ce_score} FAISS SCORE: {faiss_score}')
+            # combined_score = alpha * ce_score + (1 - alpha) * faiss_score
             length_adjusted_score = self.length_adjust_score(resp_text, combined_score)
             #combined_score = ce_score * faiss_score
+            #final_candidates.append((resp_text, combined_score))
+            final_candidates.append((resp_text, length_adjusted_score))
         # Sort descending by combined score
         final_candidates.sort(key=lambda x: x[1], reverse=True)
     def extract_keywords(self, query: str) -> List[str]:
         """
+        Return any domain keywords present in the query (lowercased).
         """
         query_lower = query.lower()
+        found = set()
+        for domain, kw_list in self.DOMAIN_KEYWORDS.items():
+            for kw in kw_list:
                 if kw in query_lower:
+                    found.add(kw)
+        return list(found)
+    def length_adjust_score(self, text: str, base_score: float) -> float:
+        """
+        Penalize very short lines or numeric lines; mildly reward longer lines.
+        Adjust carefully so you don't overshadow cross-encoder signals.
+        """
+        words = text.split()
+        wcount = len(words)
+        # Penalty if under 4 words
+        if wcount < 4:
+            return base_score * 0.8
+        # Bonus for lines > 12 words
+        if wcount > 12:
+            extra = min(wcount - 12, 8)
+            bonus = 0.0005 * extra
             base_score += bonus
         return base_score
     def detect_domain_from_query(self, query: str) -> str:
         Detect the domain of the query based on keywords.
         """
         domain_patterns = {
+            'restaurant': r'\b(restaurant|restaurants?|dining|food|foods?|dine|reservation|reservations?|table|tables?|menu|menus?|cuisine|cuisines?|eat|eats?|place\s?to\s?eat|places\s?to\s?eat|hungry|chef|chefs?|dish|dishes?|meal|meals?|fork|forks?|knife|knives?|spoon|spoons?|brunch|bistro|buffet|buffets?|catering|caterings?|gourmet|fast\s?food|fine\s?dining|takeaway|takeaways?|delivery|deliveries|restaurant\s?booking)\b',
+            'movie': r'\b(movie|movies?|cinema|cinemas?|film|films?|ticket|tickets?|showtime|showtimes?|showing|showings?|theater|theaters?|flick|flicks?|screening|screenings?|film\s?ticket|film\s?tickets?|film\s?show|film\s?shows?|blockbuster|blockbusters?|premiere|premieres?|trailer|trailers?|director|directors?|actor|actors?|actress|actresses?|plot|plots?|genre|genres?|screen|screens?|sequel|sequels?|animation|animations?|documentary|documentaries)\b',
+            'ride_share': r'\b(ride|rides?|taxi|taxis?|uber|lyft|car\s?service|car\s?services?|pickup|pickups?|dropoff|dropoffs?|driver|drivers?|cab|cabs?|hailing|hailings?|rideshare|rideshares?|ride\s?hailing|ride\s?hailings?|carpool|carpools?|chauffeur|chauffeurs?|transit|transits?|transportation|transportations?|hail\s?ride|hail\s?rides?)\b',
+            'coffee': r'\b(coffee|coffees?|café|cafés?|cafe|cafes?|starbucks|espresso|espressos?|latte|lattes?|mocha|mochas?|americano|americanos?|barista|baristas?|brew|brews?|cappuccino|cappuccinos?|macchiato|macchiatos?|iced\s?coffee|iced\s?coffees?|cold\s?brew|cold\s?brews?|espresso\s?machine|espresso\s?machines?|coffee\s?shop|coffee\s?shops?|tea|teas?|chai|chais?|java|javas?|bean|beans?|roast|roasts?|decaf)\b',
+            'pizza': r'\b(pizza|pizzas?|delivery|deliveries|order\s?food|order\s?foods?|pepperoni|pepperonis?|topping|toppings?|pizzeria|pizzerias?|slice|slices?|pie|pies?|margherita|margheritas?|deep\s?dish|deep\s?dishes?|thin\s?crust|thin\s?crusts?|cheese|cheeses?|oven|ovens?|tossed|tosses?|sauce|sauces?|garlic\s?bread|garlic\s?breads?|calzone|calzones?)\b',
+            'auto': r'\b(car|cars?|vehicle|vehicles?|repair|repairs?|maintenance|maintenances?|mechanic|mechanics?|oil\s?change|oil\s?changes?|garage|garages?|auto\s?shop|auto\s?shops?|tire|tires?|check\s?engine|check\s?engines?|battery|batteries?|transmission|transmissions?|brake|brakes?|engine\s?diagnostics|engine\s?diagnostic|carwash|carwashes?|detail|details?|alignment|alignments?|exhaust|exhausts?|spark\s?plug|spark\s?plugs?|dashboard|dashboards?)\b',
         }
         # Check for matches
         return 'other'
+    def is_numeric_response(self, text: str) -> bool:
         """
+        Return True if `text` is purely digits (and/or spaces),
+        with optional punctuation like '.' at the end.
         """
+        pattern = r'^[\s]*[\d]+([\s.,\d]+)*[\s]*$'
+        return bool(re.match(pattern, text.strip()))
     def retrieve_responses_faiss(
         self,
         query: str,
         domain: str = 'other',
         top_k: int = 5,
+        boost_factor: float = 1.05
     ) -> List[Tuple[str, float]]:
         """
         Retrieve top-k responses from the FAISS index (IndexFlatIP) given a user query.
         q_emb_np = q_emb.reshape(1, -1).astype('float32')
         # Search the index
+        distances, indices = self.data_pipeline.index.search(q_emb_np, top_k * 10)
         # IndexFlatIP: 'distances' are inner products (cosine similarities for normalized vectors)
         candidates = []
         for rank, idx in enumerate(indices[0]):
+            if idx < 0:
+                continue
             response = self.data_pipeline.response_pool[idx]
+            text = response.get('text', '').strip()
             cand_domain = response.get('domain', 'other')
             score = distances[0][rank]
+            # Skip purely numeric or extremely short text (fewer than 4 words):
+            words = text.split()
+            if len(words) < 4:
+                continue
+            if self.is_numeric_response(text):
+                continue
+            candidates.append((text, cand_domain, score))
         if not candidates:
+            logger.warning("No valid candidates found after initial numeric/length filtering.")
             return []
         # Sort candidates by score descending
         candidates.sort(key=lambda x: x[2], reverse=True)
         # Filter in-domain responses
+        in_domain = [c for c in candidates if c[1] == domain]
+        if not in_domain:
+            logger.info(f"No in-domain responses found for '{domain}'. Using all candidates.")
+            in_domain = candidates
         # Boost responses containing query keywords
         query_keywords = self.extract_keywords(query)
+        boosted = []
+        for (resp_text, resp_domain, score) in in_domain:
+            new_score = score
+            # If the domain is known AND the response text
+            # shares any query keywords, apply a small boost
+            if query_keywords and any(kw in resp_text.lower() for kw in query_keywords):
+                new_score *= boost_factor
+                #logger.debug(f"Boosting response: '{resp_text}' by factor {boost_factor}")
+            # Apply length penalty/bonus
+            new_score = self.length_adjust_score(resp_text, new_score)
+            boosted.append((resp_text, new_score))
         # Sort boosted responses
+        boosted.sort(key=lambda x: x[1], reverse=True)
+        # Log up to the top 100 candidates at debug level
+        for resp, score in boosted[:100]:
+            logger.debug(f"Candidate: '{resp}' with score {score}")
+        # 8) Return top_k
+        return boosted[:top_k]
     def chat(
         self,

chatbot_validator.py CHANGED Viewed

@@ -31,41 +31,48 @@ class ChatbotValidator:
         # Basic domain-specific test queries (easy examples)
         # Taskmaster-1 and Schema-Guided style
         self.domain_queries = {
-            'restaurant': [
-                "I'd like to make a reservation for dinner tonight.",
-                "Can you book a table for 4 at an Italian restaurant?",
-                "Is there any availability to dine tomorrow at 7pm?",
-                "I'd like to cancel my reservation for tonight.",
-                "What's the wait time for a table right now?"
             ],
-            'movie_tickets': [
-                "I want to buy tickets for the new Marvel movie.",
-                "Are there any showings of Avatar after 6pm?",
-                "Can I get 3 tickets for the 8pm show?",
-                "What movies are playing this weekend?",
-                "Do you have any matinee showtimes available?"
-            ],
-            'rideshare': [
-                "I need a ride from the airport to downtown.",
-                "How much would it cost to get to the mall?",
-                "Can you book a car for tomorrow morning?",
-                "Is there a driver available right now?",
-                "What's the estimated arrival time for the driver?"
-            ],
-            'services': [
-                "I need to schedule an oil change for my car.",
-                "When can I bring my car in for maintenance?",
-                "Do you have any openings for auto repair today?",
-                "How long will the service take?",
-                "Can I get an estimate for brake repair?"
-            ],
-            'events': [
-                "I need tickets to the concert this weekend.",
-                "What events are happening near me?",
-                "Can I book seats for the basketball game?",
-                "Are there any comedy shows tonight?",
-                "How much are tickets to the theater?"
-            ]
         }
     def run_validation(
@@ -237,13 +244,13 @@ class ChatbotValidator:
         is_confident = metrics.get('is_confident', False)
         logger.info(f"Domain: {domain} | Confidence: {'Yes' if is_confident else 'No'}")
-        logger.info("Quality Metrics:")
-        for k, v in metrics.items():
-            if isinstance(v, (int, float)):
-                logger.info(f"  {k}: {v:.4f}")
-        logger.info("Top 3 Responses:")
-        for i, (resp_text, score) in enumerate(responses[:3], 1):
             logger.info(f"{i}) Score: {score:.4f} | {resp_text}")
             if i == 1 and not is_confident:
                 logger.info("   [Low Confidence on Top Response]")

         # Basic domain-specific test queries (easy examples)
         # Taskmaster-1 and Schema-Guided style
         self.domain_queries = {
+            # 'restaurant': [
+            #     "Hi, I have a question about your restaurant. Do they take reservations?",
+            #     "I'd like to make a reservation for dinner tonight after 6pm. Is that time available?",
+            #     "Can you recommend an Italian restaurant with wood-fired pizza?",
+            #     "Is there parking available if we dine at your restaurant tomorrow evening?",
+            #     "What's the average cost per plate at your restaurant?"
+            # # ],
+            'movie': [
+                "How much are movie tickets for two people?",
+                "I'm looking for showings after 6pm?",
+                "Is this at the new theater with reclining seats?",
+                "Hi, I'm thinking about reserving tickets for the new movie.",
+                "What is the price for your largest popcorn?"
             ],
+            # 'ride_share': [
+            #     "I need a ride from the airport to downtown.",
+            #     "How much would it cost to get to the mall?",
+            #     "Can you book a car for tomorrow morning?",
+            #     "Is there a driver available right now?",
+            #     "What's the estimated arrival time for the driver?"
+            # ],
+            # 'coffee': [
+            #     "Can I get a latte with almond milk?",
+            #     "Can I get a cappuccino with oat milk?",
+            #     "Can I get a mocha with coconut milk?",
+            #     "Can I get a cappuccino with almond milk?",
+            #     "Can I get a mocha with oat milk?",
+            # ],
+            # 'pizza': [
+            #     "Can I get a pizza with extra cheese?",
+            #     "Can I get a pizza with mushrooms?",
+            #     "Can I get a pizza with bell peppers?",
+            #     "Can I get a pizza with onions?",
+            #     "Can I get a pizza with olives?"
+            # ],
+            # 'auto': [
+            #     "I need to schedule an oil change for my car.",
+            #     "When can I bring my car in for maintenance?",
+            #     "Do you have any openings for auto repair today?",
+            #     "How long will the service take?",
+            #     "Can I get an estimate for brake repair?"
+            #],
         }
     def run_validation(
         is_confident = metrics.get('is_confident', False)
         logger.info(f"Domain: {domain} | Confidence: {'Yes' if is_confident else 'No'}")
+        # logger.info("Quality Metrics:")
+        # for k, v in metrics.items():
+        #     if isinstance(v, (int, float)):
+        #         logger.info(f"  {k}: {v:.4f}")
+        logger.info("Top 10 Responses:")
+        for i, (resp_text, score) in enumerate(responses[:10], 1):
             logger.info(f"{i}) Score: {score:.4f} | {resp_text}")
             if i == 1 and not is_confident:
                 logger.info("   [Low Confidence on Top Response]")

new_iteration/run_taskmaster_processor.py CHANGED Viewed

@@ -9,7 +9,7 @@ def main():
     # 1) Setup config
     config = PipelineConfig(
         max_length=512,
-        min_turns=3,
         min_user_words=3,
         debug=True
     )

     # 1) Setup config
     config = PipelineConfig(
         max_length=512,
+        min_turns=4,
         min_user_words=3,
         debug=True
     )

new_iteration/taskmaster_processor.py CHANGED Viewed

@@ -1,33 +1,52 @@
-import json
 import re
 from pathlib import Path
-from typing import List, Dict, Any, Optional
 from dataclasses import dataclass, field
-from pipeline_config import PipelineConfig
 @dataclass
 class TaskmasterDialogue:
-    """Structured representation of a Taskmaster-1 dialogue."""
     conversation_id: str
     instruction_id: Optional[str]
     scenario: Optional[str]
-    domain: str
-    turns: List[Dict[str, Any]] = field(default_factory=list)
     def validate(self) -> bool:
-        """Check if this dialogue has an ID and a list of turns."""
         return bool(self.conversation_id and isinstance(self.turns, list))
 class TaskmasterProcessor:
     """
-    Loads Taskmaster-1 dialogues, extracts domain from scenario,
-    filters them, and outputs a final pipeline-friendly format.
     """
     def __init__(self, config: PipelineConfig):
         self.config = config
-    def load_taskmaster_dataset(self, base_dir: str, max_examples: Optional[int] = None) -> List[TaskmasterDialogue]:
         """
         Load and parse Taskmaster JSON for self-dialogs & woz-dialogs (Taskmaster-1).
         Combines scenario text + conversation utterances to detect domain more robustly.
@@ -35,14 +54,14 @@ class TaskmasterProcessor:
         required_files = {
             "self-dialogs": "self-dialogs.json",
             "woz-dialogs": "woz-dialogs.json",
-            "ontology": "ontology.json",  # we might not actively use this, but let's expect it
         }
-        # Check for missing
         missing = [k for k, v in required_files.items() if not Path(base_dir, v).exists()]
         if missing:
             raise FileNotFoundError(f"Missing Taskmaster files: {missing}")
-        # Load ontology (optional usage)
         ontology_path = Path(base_dir, required_files["ontology"])
         with open(ontology_path, 'r', encoding='utf-8') as f:
             ontology = json.load(f)
@@ -51,7 +70,6 @@ class TaskmasterProcessor:
         dialogues: List[TaskmasterDialogue] = []
-        # We'll read the 2 main files
         file_keys = ["self-dialogs", "woz-dialogs"]
         for file_key in file_keys:
             file_path = Path(base_dir, required_files[file_key])
@@ -61,26 +79,23 @@ class TaskmasterProcessor:
             for d in raw_data:
                 conversation_id = d.get("conversation_id", "")
                 instruction_id = d.get("instruction_id", None)
-                scenario_text = d.get("scenario", "")  # old scenario approach
-                # Collect utterances -> turns
                 utterances = d.get("utterances", [])
                 turns = self._process_utterances(utterances)
-                # Instead of only using scenario_text, we combine scenario + turn texts.
-                # We'll pass everything to _extract_domain
-                domain = self._extract_domain(
-                    scenario_text,
-                    turns  # pass the entire turn list so we can pick up domain keywords
-                )
-                # Create a structured object
                 new_dlg = TaskmasterDialogue(
                     conversation_id=conversation_id,
                     instruction_id=instruction_id,
                     scenario=scenario_text,
                     domain=domain,
-                    turns=turns
                 )
                 dialogues.append(new_dlg)
@@ -93,85 +108,126 @@ class TaskmasterProcessor:
     def _extract_domain(self, scenario: str, turns: List[Dict[str, str]]) -> str:
         """
-        Combine scenario text + all turn texts to detect the domain more robustly.
         """
-        # 1) Combine scenario + conversation text
         combined_text = scenario.lower()
         for turn in turns:
-            text = turn.get('text', '').strip().lower()
-            combined_text += " " + text
-        # 2) Expanded domain patterns (edit or expand as you wish)
         domain_patterns = {
-            'restaurant': r'\b(restaurant|dining|food|reservation|table|menu|cuisine|eat)\b',
-            'movie': r'\b(movie|cinema|film|ticket|showtime|theater)\b',
-            'ride_share': r'\b(ride|taxi|uber|lyft|car\s?service|pickup|dropoff)\b',
             'coffee': r'\b(coffee|café|cafe|starbucks|espresso|latte|mocha|americano)\b',
-            'pizza': r'\b(pizza|delivery|order\s?food|pepperoni|topping|pizzeria)\b',
             'auto': r'\b(car|vehicle|repair|maintenance|mechanic|oil\s?change)\b'
         }
-        # 3) Return first matched domain or 'other'
         for dom, pattern in domain_patterns.items():
             if re.search(pattern, combined_text):
-                print(f"Matched domain: {dom}")
                 return dom
-        print("No domain match, returning 'other'")
         return 'other'
     def _process_utterances(self, utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
-        """Map speaker to user/assistant, store text."""
-        turns = []
         for utt in utterances:
             speaker = 'assistant' if utt.get('speaker') == 'ASSISTANT' else 'user'
-            text = utt.get('text', '').strip()
-            turns.append({
                 'speaker': speaker,
                 'text': text
             })
-        return turns
     def filter_and_convert(self, dialogues: List[TaskmasterDialogue]) -> List[Dict]:
         """
         Filter out dialogues that don't meet min turns / min user words,
-        then convert them to final pipeline dict:
-        {
-          "dialogue_id": "...",
-          "domain": "...",
-          "turns": [
-            {"speaker": "user", "text": "..."},
-            ...
-          ]
-        }
         """
         results = []
         for dlg in dialogues:
             if not dlg.validate():
                 continue
             if len(dlg.turns) < self.config.min_turns:
                 continue
             # Check user-turn min words
             keep = True
             for turn in dlg.turns:
                 if turn['speaker'] == 'user':
-                    word_count = len(turn['text'].split())
-                    if word_count < self.config.min_user_words:
                         keep = False
                         break
             if not keep:
                 continue
             pipeline_dlg = {
                 'dialogue_id': dlg.conversation_id,
                 'domain': dlg.domain,
-                'turns': dlg.turns  # or you can refine further if needed
             }
             results.append(pipeline_dlg)
         if self.config.debug:
-            print(f"[TaskmasterProcessor] Filtered down to {len(results)} dialogues.")
         return results

+import os
 import re
+import json
 from pathlib import Path
+from typing import List, Dict, Optional, Any
 from dataclasses import dataclass, field
 @dataclass
 class TaskmasterDialogue:
     conversation_id: str
     instruction_id: Optional[str]
     scenario: Optional[str]
+    domain: Optional[str]
+    turns: List[Dict[str, Any]]
+    original_metadata: Dict[str, Any] = field(default_factory=dict)
+    def __str__(self):
+        return f"TaskmasterDialogue(conversation_id={self.conversation_id}, turns={len(self.turns)} turns)"
     def validate(self) -> bool:
         return bool(self.conversation_id and isinstance(self.turns, list))
+class PipelineConfig:
+    """
+    Example config structure. Adjust to your real config usage.
+    """
+    def __init__(
+        self,
+        debug: bool = True,
+        min_turns: int = 2,
+        min_user_words: int = 3
+    ):
+        self.debug = debug
+        self.min_turns = min_turns
+        self.min_user_words = min_user_words
 class TaskmasterProcessor:
     """
+    Loads Taskmaster-1 dialogues, extracts domain from scenario,
+    cleans + filters them, and outputs a pipeline-friendly format.
     """
     def __init__(self, config: PipelineConfig):
         self.config = config
+    def load_taskmaster_dataset(
+        self,
+        base_dir: str,
+        max_examples: Optional[int] = None
+    ) -> List[TaskmasterDialogue]:
         """
         Load and parse Taskmaster JSON for self-dialogs & woz-dialogs (Taskmaster-1).
         Combines scenario text + conversation utterances to detect domain more robustly.
         required_files = {
             "self-dialogs": "self-dialogs.json",
             "woz-dialogs": "woz-dialogs.json",
+            "ontology": "ontology.json",  # we might not actively use it, but let's expect it
         }
+        # 1) Check for missing
         missing = [k for k, v in required_files.items() if not Path(base_dir, v).exists()]
         if missing:
             raise FileNotFoundError(f"Missing Taskmaster files: {missing}")
+        # 2) Optionally load ontology
         ontology_path = Path(base_dir, required_files["ontology"])
         with open(ontology_path, 'r', encoding='utf-8') as f:
             ontology = json.load(f)
         dialogues: List[TaskmasterDialogue] = []
         file_keys = ["self-dialogs", "woz-dialogs"]
         for file_key in file_keys:
             file_path = Path(base_dir, required_files[file_key])
             for d in raw_data:
                 conversation_id = d.get("conversation_id", "")
                 instruction_id = d.get("instruction_id", None)
+                scenario_text = d.get("scenario", "")
+                # 3) Convert raw utterances
                 utterances = d.get("utterances", [])
                 turns = self._process_utterances(utterances)
+                # 4) Domain detection
+                domain = self._extract_domain(scenario_text, turns)
+                # 5) Build the structured object
                 new_dlg = TaskmasterDialogue(
                     conversation_id=conversation_id,
                     instruction_id=instruction_id,
                     scenario=scenario_text,
                     domain=domain,
+                    turns=turns,
+                    original_metadata={}
                 )
                 dialogues.append(new_dlg)
     def _extract_domain(self, scenario: str, turns: List[Dict[str, str]]) -> str:
         """
+        Combine scenario text + all turn texts to detect domain more robustly.
         """
         combined_text = scenario.lower()
         for turn in turns:
+            txt = turn.get('text', '').lower()
+            combined_text += " " + txt
+        # Expanded domain patterns
         domain_patterns = {
+            'restaurant': r'\b(restaurant|dining|food|reservation|table|menu|cuisine|eat|hungry)\b',
+            'movie': r'\b(movie|cinema|film|ticket|showtime|theater|flick|screening)\b',
+            'ride_share': r'\b(ride|taxi|uber|lyft|car\s?service|pickup|dropoff|driver)\b',
             'coffee': r'\b(coffee|café|cafe|starbucks|espresso|latte|mocha|americano)\b',
+            'pizza': r'\b(pizza|delivery|order\s?food|pepperoni|topping|pizzeria|slice)\b',
             'auto': r'\b(car|vehicle|repair|maintenance|mechanic|oil\s?change)\b'
         }
         for dom, pattern in domain_patterns.items():
             if re.search(pattern, combined_text):
+                # Optional: print if debug
+                if self.config.debug:
+                    print(f"Matched domain: {dom} in scenario/turns")
                 return dom
+        if self.config.debug:
+            print("No domain match, returning 'other'")
         return 'other'
     def _process_utterances(self, utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
+        """
+        Convert raw utterances to a cleaned list of (speaker, text).
+        Skip or remove lines that are numeric, too short, or empty.
+        """
+        cleaned_turns = []
         for utt in utterances:
             speaker = 'assistant' if utt.get('speaker') == 'ASSISTANT' else 'user'
+            raw_text = utt.get('text', '').strip()
+            # 1) Optional text cleaning
+            text = self._clean_text(raw_text)
+            # 2) Skip blank or numeric lines
+            if not text:
+                continue
+            if self._is_numeric_line(text):
+                continue
+            # 3) If it's extremely short, skip.
+            # (For example, "ok" or "yes" might be 1-2 words.)
+            if len(text.split()) < 2:
+                # Optionally keep "ok" or "yes" if you'd like, but let's skip them to keep quality up
+                continue
+            # 4) Append
+            cleaned_turns.append({
                 'speaker': speaker,
                 'text': text
             })
+        return cleaned_turns
+    def _clean_text(self, text: str) -> str:
+        """
+        Basic text normalization: remove repeated punctuation, handle weird spacing, etc.
+        Adjust to your needs.
+        """
+        # Example: collapse multiple spaces
+        text = re.sub(r'\s+', ' ', text)
+        # Example: remove trailing punctuation or repeated punctuation
+        # e.g. "Sure!!!" => "Sure!"
+        text = re.sub(r'([!?.,])\1+', r'\1', text)
+        return text.strip()
+    def _is_numeric_line(self, text: str) -> bool:
+        """
+        Return True if line is purely digits/punctuation/spaces,
+        e.g. "4 3 13", "12345", "3.14". Adjust as needed.
+        """
+        pattern = r'^[\s]*[\d]+([\s\d.,]+)*[\s]*$'
+        return bool(re.match(pattern, text))
     def filter_and_convert(self, dialogues: List[TaskmasterDialogue]) -> List[Dict]:
         """
         Filter out dialogues that don't meet min turns / min user words,
+        then convert them to final pipeline format:
+            {
+              "dialogue_id": "...",
+              "domain": "...",
+              "turns": [ {"speaker": "user", "text": "..."}, ... ]
+            }
         """
         results = []
         for dlg in dialogues:
             if not dlg.validate():
                 continue
+            # If after cleaning, we have too few turns, skip
             if len(dlg.turns) < self.config.min_turns:
                 continue
             # Check user-turn min words
+            # E.g. user must have >= 3 words
             keep = True
             for turn in dlg.turns:
                 if turn['speaker'] == 'user':
+                    words_count = len(turn['text'].split())
+                    if words_count < self.config.min_user_words:
                         keep = False
                         break
             if not keep:
                 continue
             pipeline_dlg = {
                 'dialogue_id': dlg.conversation_id,
                 'domain': dlg.domain,
+                'turns': dlg.turns  # already cleaned
             }
             results.append(pipeline_dlg)
         if self.config.debug:
+            print(f"[TaskmasterProcessor] Filtered down to {len(results)} dialogues after cleaning.")
         return results

validate_model.py CHANGED Viewed

@@ -103,10 +103,6 @@ def validate_chatbot():
             chatbot.data_pipeline.response_pool = json.load(f)
         logger.info(f"Response pool loaded from {RESPONSE_POOL_PATH}.")
-        print("Sample from response pool (first 10):")
-        for i, response in enumerate(chatbot.data_pipeline.response_pool[:10]):
-            print(f"{i}: {response}")
         print("\nTotal responses in pool:", len(chatbot.data_pipeline.response_pool))
         # Validate dimension consistency

             chatbot.data_pipeline.response_pool = json.load(f)
         logger.info(f"Response pool loaded from {RESPONSE_POOL_PATH}.")
         print("\nTotal responses in pool:", len(chatbot.data_pipeline.response_pool))
         # Validate dimension consistency