Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	
		Vu Minh Chien
		
	committed on
		
		
					Commit 
							
							·
						
						06d9f7d
	
1
								Parent(s):
							
							5a202c5
								
change predict rule
Browse files
- Dockerfile +2 -2
- routes/predict.py +118 -109
- validate_optimization.py +2 -2
    	
        Dockerfile
    CHANGED
    
    | @@ -28,8 +28,8 @@ COPY requirements.txt . | |
| 28 | 
             
            RUN --mount=type=secret,id=BITBUCKET_APP_PW,mode=0444,required=true \
         | 
| 29 | 
             
                git clone https://vumichien:$(cat /run/secrets/BITBUCKET_APP_PW)@bitbucket.org/dtm-partners/meisai-check-ai.git && \
         | 
| 30 | 
             
                cd meisai-check-ai && \
         | 
| 31 | 
            -
                git checkout  | 
| 32 | 
            -
                git pull origin  | 
| 33 | 
             
                cd ..
         | 
| 34 |  | 
| 35 | 
             
            # Cài đặt dependencies
         | 
|  | |
| 28 | 
             
            RUN --mount=type=secret,id=BITBUCKET_APP_PW,mode=0444,required=true \
         | 
| 29 | 
             
                git clone https://vumichien:$(cat /run/secrets/BITBUCKET_APP_PW)@bitbucket.org/dtm-partners/meisai-check-ai.git && \
         | 
| 30 | 
             
                cd meisai-check-ai && \
         | 
| 31 | 
            +
                git checkout staging && \
         | 
| 32 | 
            +
                git pull origin staging && \
         | 
| 33 | 
             
                cd ..
         | 
| 34 |  | 
| 35 | 
             
            # Cài đặt dependencies
         | 
    	
        routes/predict.py
    CHANGED
    
    | @@ -21,8 +21,14 @@ from mapping_lib.sub_subject_and_name_data_mapper import SubSubjectAndNameDataMa | |
| 21 | 
             
            from mapping_lib.sub_subject_location_data_mapper import SubSubjectLocationDataMapper
         | 
| 22 | 
             
            from mapping_lib.abstract_similarity_mapper import AbstractSimilarityMapper
         | 
| 23 | 
             
            from mapping_lib.name_and_abstract_mapper import NameAndAbstractDataMapper
         | 
| 24 | 
            -
            from mapping_lib. | 
| 25 | 
            -
            from mapping_lib. | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 26 |  | 
| 27 | 
             
            from config import UPLOAD_DIR, OUTPUT_DIR
         | 
| 28 | 
             
            from models import (
         | 
| @@ -65,6 +71,21 @@ async def predict( | |
| 65 | 
             
                    # Load input data
         | 
| 66 | 
             
                    start_time = time.time()
         | 
| 67 | 
             
                    df_input_data = pd.read_csv(input_file_path)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 68 |  | 
| 69 | 
             
                    # Ensure basic columns exist with default values
         | 
| 70 | 
             
                    basic_columns = {
         | 
| @@ -83,9 +104,8 @@ async def predict( | |
| 83 | 
             
                        if col not in df_input_data.columns:
         | 
| 84 | 
             
                            df_input_data[col] = default_value
         | 
| 85 |  | 
| 86 | 
            -
                    #  | 
| 87 | 
             
                    try:
         | 
| 88 | 
            -
                        # Subject mapping
         | 
| 89 | 
             
                        if sentence_service.df_subject_map_data is not None:
         | 
| 90 | 
             
                            subject_similarity_mapper = SubjectSimilarityMapper(
         | 
| 91 | 
             
                                cached_embedding_helper=sentence_service.subject_cached_embedding_helper,
         | 
| @@ -93,35 +113,29 @@ async def predict( | |
| 93 | 
             
                            )
         | 
| 94 |  | 
| 95 | 
             
                            list_input_subject = df_input_data["科目"].unique()
         | 
| 96 | 
            -
                            df_subject_data = pd.DataFrame( | 
| 97 |  | 
| 98 | 
            -
                            subject_similarity_mapper. | 
| 99 |  | 
| 100 | 
            -
                            output_subject_map = dict(
         | 
| 101 | 
            -
             | 
| 102 | 
            -
                             | 
| 103 | 
            -
                            df_input_data | 
| 104 | 
            -
                                output_subject_map
         | 
| 105 | 
            -
                            )
         | 
| 106 | 
            -
                            df_input_data["出力_科目"] = df_input_data["科目"].map(
         | 
| 107 | 
            -
                                output_subject_map
         | 
| 108 | 
            -
                            )
         | 
| 109 |  | 
| 110 | 
             
                    except Exception as e:
         | 
| 111 | 
             
                        print(f"Error processing SubjectSimilarityMapper: {e}")
         | 
| 112 | 
             
                        raise HTTPException(status_code=500, detail=str(e))
         | 
| 113 |  | 
|  | |
| 114 | 
             
                    try:
         | 
| 115 | 
            -
                        # Standard subject mapping
         | 
| 116 | 
             
                        if sentence_service.df_standard_subject_map_data is not None:
         | 
| 117 | 
             
                            standard_subject_data_mapper = StandardSubjectDataMapper(
         | 
| 118 | 
             
                                df_map_data=sentence_service.df_standard_subject_map_data
         | 
| 119 | 
             
                            )
         | 
| 120 | 
             
                            df_output_data = standard_subject_data_mapper.map_data(
         | 
| 121 | 
            -
                                df_input_data=df_input_data,
         | 
| 122 | 
            -
                                input_key_columns=["出力_科目"],
         | 
| 123 | 
            -
                                in_place=True,
         | 
| 124 | 
             
                            )
         | 
|  | |
| 125 | 
             
                        else:
         | 
| 126 | 
             
                            df_output_data = df_input_data.copy()
         | 
| 127 |  | 
| @@ -130,131 +144,127 @@ async def predict( | |
| 130 | 
             
                        # Continue with original data if standard subject mapping fails
         | 
| 131 | 
             
                        df_output_data = df_input_data.copy()
         | 
| 132 |  | 
|  | |
| 133 | 
             
                    try:
         | 
| 134 | 
            -
                        # Sub subject mapping
         | 
| 135 | 
             
                        if sentence_service.df_sub_subject_map_data is not None:
         | 
| 136 | 
             
                            sub_subject_similarity_mapper = SubSubjectSimilarityMapper(
         | 
| 137 | 
             
                                cached_embedding_helper=sentence_service.sub_subject_cached_embedding_helper,
         | 
| 138 | 
             
                                df_map_data=sentence_service.df_sub_subject_map_data,
         | 
| 139 | 
             
                            )
         | 
| 140 | 
            -
                             | 
| 141 | 
            -
                                 | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 142 | 
             
                            )
         | 
| 143 | 
            -
                            df_output_data | 
| 144 |  | 
| 145 | 
             
                    except Exception as e:
         | 
| 146 | 
             
                        print(f"Error processing SubSubjectSimilarityMapper: {e}")
         | 
| 147 | 
             
                        raise HTTPException(status_code=500, detail=str(e))
         | 
| 148 |  | 
| 149 | 
            -
                    
         | 
| 150 | 
             
                    try:
         | 
| 151 | 
            -
                        # Name mapping
         | 
| 152 | 
             
                        if sentence_service.df_name_map_data is not None:
         | 
| 153 | 
             
                            name_sentence_mapper = NameSimilarityMapper(
         | 
| 154 | 
             
                                cached_embedding_helper=sentence_service.name_cached_embedding_helper,
         | 
| 155 | 
             
                                df_map_data=sentence_service.df_name_map_data,
         | 
| 156 | 
             
                            )
         | 
| 157 | 
            -
                            name_sentence_mapper. | 
|  | |
| 158 |  | 
| 159 | 
             
                    except Exception as e:
         | 
| 160 | 
             
                        print(f"Error processing NameSimilarityMapper: {e}")
         | 
| 161 | 
             
                        raise HTTPException(status_code=500, detail=str(e))
         | 
| 162 | 
            -
                    try:
         | 
| 163 | 
            -
                        sub_subject_location_mapper = SubSubjectLocationDataMapper()
         | 
| 164 | 
            -
                        sub_subject_location_mapper.map_location(df_output_data)
         | 
| 165 | 
            -
                    except Exception as e:
         | 
| 166 | 
            -
                        print(f"Error processing SubSubjectLocationDataMapper: {e}")
         | 
| 167 | 
            -
                        raise HTTPException(status_code=500, detail=str(e))
         | 
| 168 |  | 
|  | |
| 169 | 
             
                    try:
         | 
| 170 | 
            -
                        # Sub subject and name mapping
         | 
| 171 | 
             
                        if sentence_service.df_sub_subject_and_name_map_data is not None:
         | 
| 172 | 
            -
                             | 
| 173 | 
             
                                df_map_data=sentence_service.df_sub_subject_and_name_map_data
         | 
| 174 | 
             
                            )
         | 
| 175 | 
            -
                             | 
| 176 |  | 
| 177 | 
             
                    except Exception as e:
         | 
| 178 | 
             
                        print(f"Error processing SubSubjectAndNameDataMapper: {e}")
         | 
| 179 | 
             
                        raise HTTPException(status_code=500, detail=str(e))
         | 
| 180 |  | 
|  | |
| 181 | 
             
                    try:
         | 
| 182 | 
            -
                         | 
| 183 | 
            -
             | 
| 184 | 
            -
             | 
| 185 | 
            -
             | 
| 186 | 
            -
             | 
| 187 | 
            -
             | 
| 188 | 
            -
             | 
| 189 | 
            -
             | 
| 190 | 
            -
             | 
| 191 | 
            -
                             | 
| 192 | 
            -
             | 
| 193 | 
            -
             | 
| 194 | 
            -
                             | 
| 195 | 
            -
             | 
| 196 | 
            -
             | 
| 197 | 
            -
             | 
| 198 | 
            -
             | 
| 199 | 
            -
             | 
| 200 | 
            -
             | 
| 201 | 
            -
             | 
| 202 | 
            -
                            for col in ["標準科目", "摘要グループ", "確定", "摘要", "備考"]:
         | 
| 203 | 
            -
                                if col in df_output_data.columns:
         | 
| 204 | 
            -
                                    df_output_data[col] = df_output_data[col].astype(str).fillna("")
         | 
| 205 |  | 
|  | |
|  | |
|  | |
| 206 | 
             
                            abstract_similarity_mapper = AbstractSimilarityMapper(
         | 
| 207 | 
             
                                cached_embedding_helper=sentence_service.abstract_cached_embedding_helper,
         | 
| 208 | 
             
                                df_map_data=sentence_service.df_abstract_map_data,
         | 
| 209 | 
             
                            )
         | 
| 210 | 
            -
                            abstract_similarity_mapper. | 
| 211 | 
            -
             | 
| 212 | 
            -
                            print(f"DEBUG: AbstractSimilarityMapper completed successfully")
         | 
| 213 |  | 
| 214 | 
             
                    except Exception as e:
         | 
| 215 | 
             
                        print(f"Error processing AbstractSimilarityMapper: {e}")
         | 
| 216 | 
             
                        print(f"DEBUG: Full error traceback:")
         | 
| 217 | 
            -
                        import traceback
         | 
| 218 | 
            -
             | 
| 219 | 
             
                        traceback.print_exc()
         | 
| 220 | 
             
                        # Don't raise the exception, continue processing
         | 
| 221 | 
             
                        print(f"DEBUG: Continuing without AbstractSimilarityMapper...")
         | 
| 222 |  | 
|  | |
| 223 | 
             
                    try:
         | 
| 224 | 
            -
                        # Name and abstract mapping
         | 
| 225 | 
             
                        if sentence_service.df_name_and_subject_map_data is not None:
         | 
| 226 | 
             
                            name_and_abstract_mapper = NameAndAbstractDataMapper(
         | 
| 227 | 
             
                                df_map_data=sentence_service.df_name_and_subject_map_data
         | 
| 228 | 
             
                            )
         | 
| 229 | 
            -
                            df_output_data =  | 
|  | |
|  | |
|  | |
| 230 |  | 
| 231 | 
             
                    except Exception as e:
         | 
| 232 | 
             
                        print(f"Error processing NameAndAbstractDataMapper: {e}")
         | 
| 233 | 
             
                        raise HTTPException(status_code=500, detail=str(e))
         | 
| 234 |  | 
|  | |
| 235 | 
             
                    try:
         | 
| 236 | 
            -
                         | 
| 237 | 
            -
                         | 
| 238 | 
            -
             | 
| 239 | 
            -
                                cached_embedding_helper=sentence_service.unit_cached_embedding_helper,
         | 
| 240 | 
            -
                                df_map_data=sentence_service.df_unit_map_data,
         | 
| 241 | 
            -
                            )
         | 
| 242 | 
            -
                            unit_mapper.predict_input_optimized(df_input_data=df_output_data)
         | 
| 243 | 
            -
             | 
| 244 | 
            -
                    except Exception as e:
         | 
| 245 | 
            -
                        print(f"Error processing UnitMapper: {e}")
         | 
| 246 | 
            -
                        raise HTTPException(status_code=500, detail=str(e))
         | 
| 247 | 
            -
             | 
| 248 | 
            -
                    try:
         | 
| 249 | 
            -
                        # Standard name mapping
         | 
| 250 | 
            -
                        if sentence_service.df_standard_name_map_data is not None:
         | 
| 251 | 
            -
                            standard_name_mapper = StandardNameMapper(
         | 
| 252 | 
            -
                                df_map_data=sentence_service.df_standard_name_map_data
         | 
| 253 | 
            -
                            )
         | 
| 254 | 
            -
                            df_output_data = standard_name_mapper.map_data(df_output_data)
         | 
| 255 | 
            -
             | 
| 256 | 
             
                    except Exception as e:
         | 
| 257 | 
            -
                        print(f"Error processing  | 
| 258 | 
             
                        raise HTTPException(status_code=500, detail=str(e))
         | 
| 259 |  | 
| 260 | 
             
                    # Create output columns and ensure they have proper values
         | 
| @@ -286,7 +296,6 @@ async def predict( | |
| 286 | 
             
                    for col, default_value in required_columns.items():
         | 
| 287 | 
             
                        if col not in df_output_data.columns:
         | 
| 288 | 
             
                            df_output_data[col] = default_value
         | 
| 289 | 
            -
             | 
| 290 | 
             
                    # Map output columns to match Excel structure
         | 
| 291 | 
             
                    # 出力_中科目 mapping - use the standard sub-subject from sub-subject mapper
         | 
| 292 | 
             
                    if "出力_中科目" in df_output_data.columns:
         | 
| @@ -331,26 +340,26 @@ async def predict( | |
| 331 | 
             
                    print(f"Available columns after processing: {list(df_output_data.columns)}")
         | 
| 332 |  | 
| 333 | 
             
                    # Final check and fallback for missing output columns
         | 
| 334 | 
            -
                    if (
         | 
| 335 | 
            -
             | 
| 336 | 
            -
             | 
| 337 | 
            -
                    ):
         | 
| 338 | 
            -
             | 
| 339 | 
            -
             | 
| 340 | 
            -
                    if (
         | 
| 341 | 
            -
             | 
| 342 | 
            -
             | 
| 343 | 
            -
                    ):
         | 
| 344 | 
            -
             | 
| 345 | 
            -
             | 
| 346 | 
            -
                    if (
         | 
| 347 | 
            -
             | 
| 348 | 
            -
             | 
| 349 | 
            -
                    ):
         | 
| 350 | 
            -
             | 
| 351 | 
            -
             | 
| 352 | 
            -
                    if "出力_確率度" not in df_output_data.columns:
         | 
| 353 | 
            -
             | 
| 354 |  | 
| 355 | 
             
                    # Define output columns in exact order as shown in Excel
         | 
| 356 | 
             
                    output_columns = [
         | 
| @@ -511,14 +520,14 @@ async def predict_raw( | |
| 511 | 
             
                    try:
         | 
| 512 | 
             
                        # Unit mapping
         | 
| 513 | 
             
                        if sentence_service.df_unit_map_data is not None:
         | 
| 514 | 
            -
                            unit_mapper =  | 
| 515 | 
             
                                cached_embedding_helper=sentence_service.unit_cached_embedding_helper,
         | 
| 516 | 
             
                                df_map_data=sentence_service.df_unit_map_data,
         | 
| 517 | 
             
                            )
         | 
| 518 | 
             
                            unit_mapper.predict_input(df_input_data=df_input_data)
         | 
| 519 |  | 
| 520 | 
             
                    except Exception as e:
         | 
| 521 | 
            -
                        print(f"Error processing  | 
| 522 | 
             
                        raise HTTPException(status_code=500, detail=str(e))
         | 
| 523 |  | 
| 524 | 
             
                    # Ensure required columns exist
         | 
|  | |
| 21 | 
             
            from mapping_lib.sub_subject_location_data_mapper import SubSubjectLocationDataMapper
         | 
| 22 | 
             
            from mapping_lib.abstract_similarity_mapper import AbstractSimilarityMapper
         | 
| 23 | 
             
            from mapping_lib.name_and_abstract_mapper import NameAndAbstractDataMapper
         | 
| 24 | 
            +
            from mapping_lib.unit_mapper import UnitMapper
         | 
| 25 | 
            +
            from mapping_lib.base_dictionary_mapper import BaseDictionaryMapper
         | 
| 26 | 
            +
            from common_lib.data_utilities import fillna_with_space
         | 
| 27 | 
            +
            from common_lib.string_utilities import (
         | 
| 28 | 
            +
                preprocess_text,
         | 
| 29 | 
            +
                ConversionType,
         | 
| 30 | 
            +
                ConversionSettings,
         | 
| 31 | 
            +
            )
         | 
| 32 |  | 
| 33 | 
             
            from config import UPLOAD_DIR, OUTPUT_DIR
         | 
| 34 | 
             
            from models import (
         | 
|  | |
| 71 | 
             
                    # Load input data
         | 
| 72 | 
             
                    start_time = time.time()
         | 
| 73 | 
             
                    df_input_data = pd.read_csv(input_file_path)
         | 
| 74 | 
            +
                    
         | 
| 75 | 
            +
                    # Preprocess data like in meisai-check-ai/predict.py
         | 
| 76 | 
            +
                    df_input_data["元名称"] = df_input_data["名称"]
         | 
| 77 | 
            +
                    df_input_data["名称"] = df_input_data["名称"].apply(
         | 
| 78 | 
            +
                        lambda x: (
         | 
| 79 | 
            +
                            preprocess_text(
         | 
| 80 | 
            +
                                x,
         | 
| 81 | 
            +
                                convert_kana=ConversionType.Z2H,
         | 
| 82 | 
            +
                                convert_alphabet=ConversionType.Z2H,
         | 
| 83 | 
            +
                                convert_digit=ConversionType.Z2H,
         | 
| 84 | 
            +
                            )
         | 
| 85 | 
            +
                            if pd.notna(x)
         | 
| 86 | 
            +
                            else ""
         | 
| 87 | 
            +
                        )
         | 
| 88 | 
            +
                    )
         | 
| 89 |  | 
| 90 | 
             
                    # Ensure basic columns exist with default values
         | 
| 91 | 
             
                    basic_columns = {
         | 
|  | |
| 104 | 
             
                        if col not in df_input_data.columns:
         | 
| 105 | 
             
                            df_input_data[col] = default_value
         | 
| 106 |  | 
| 107 | 
            +
                    # SubjectSimilarityMapper
         | 
| 108 | 
             
                    try:
         | 
|  | |
| 109 | 
             
                        if sentence_service.df_subject_map_data is not None:
         | 
| 110 | 
             
                            subject_similarity_mapper = SubjectSimilarityMapper(
         | 
| 111 | 
             
                                cached_embedding_helper=sentence_service.subject_cached_embedding_helper,
         | 
|  | |
| 113 | 
             
                            )
         | 
| 114 |  | 
| 115 | 
             
                            list_input_subject = df_input_data["科目"].unique()
         | 
| 116 | 
            +
                            df_subject_data = pd.DataFrame(list_input_subject, columns=["科目"])
         | 
| 117 |  | 
| 118 | 
            +
                            subject_similarity_mapper.predict_input(df_input_data=df_subject_data)
         | 
| 119 |  | 
| 120 | 
            +
                            output_subject_map = dict(zip(df_subject_data["科目"], df_subject_data["出力_科目"]))
         | 
| 121 | 
            +
                            df_input_data["標準科目"] = df_input_data["科目"].map(output_subject_map)
         | 
| 122 | 
            +
                            df_input_data["出力_科目"] = df_input_data["標準科目"]
         | 
| 123 | 
            +
                            fillna_with_space(df_input_data)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 124 |  | 
| 125 | 
             
                    except Exception as e:
         | 
| 126 | 
             
                        print(f"Error processing SubjectSimilarityMapper: {e}")
         | 
| 127 | 
             
                        raise HTTPException(status_code=500, detail=str(e))
         | 
| 128 |  | 
| 129 | 
            +
                    # StandardSubjectDataMapper
         | 
| 130 | 
             
                    try:
         | 
|  | |
| 131 | 
             
                        if sentence_service.df_standard_subject_map_data is not None:
         | 
| 132 | 
             
                            standard_subject_data_mapper = StandardSubjectDataMapper(
         | 
| 133 | 
             
                                df_map_data=sentence_service.df_standard_subject_map_data
         | 
| 134 | 
             
                            )
         | 
| 135 | 
             
                            df_output_data = standard_subject_data_mapper.map_data(
         | 
| 136 | 
            +
                                df_input_data=df_input_data, input_key_columns=["出力_科目"], in_place=True
         | 
|  | |
|  | |
| 137 | 
             
                            )
         | 
| 138 | 
            +
                            fillna_with_space(df_output_data)
         | 
| 139 | 
             
                        else:
         | 
| 140 | 
             
                            df_output_data = df_input_data.copy()
         | 
| 141 |  | 
|  | |
| 144 | 
             
                        # Continue with original data if standard subject mapping fails
         | 
| 145 | 
             
                        df_output_data = df_input_data.copy()
         | 
| 146 |  | 
| 147 | 
            +
                    # SubSubjectSimilarityMapper
         | 
| 148 | 
             
                    try:
         | 
|  | |
| 149 | 
             
                        if sentence_service.df_sub_subject_map_data is not None:
         | 
| 150 | 
             
                            sub_subject_similarity_mapper = SubSubjectSimilarityMapper(
         | 
| 151 | 
             
                                cached_embedding_helper=sentence_service.sub_subject_cached_embedding_helper,
         | 
| 152 | 
             
                                df_map_data=sentence_service.df_sub_subject_map_data,
         | 
| 153 | 
             
                            )
         | 
| 154 | 
            +
                            df_input_sub_subject = df_output_data[
         | 
| 155 | 
            +
                                ["科目", "標準科目", "出力_科目", "中科目", "分類"]
         | 
| 156 | 
            +
                            ].drop_duplicates()
         | 
| 157 | 
            +
                            sub_subject_similarity_mapper.predict_input(df_input_data=df_input_sub_subject)
         | 
| 158 | 
            +
                            
         | 
| 159 | 
            +
                            sub_subject_map_key_columns = ["科目", "標準科目", "出力_科目", "中科目", "分類"]
         | 
| 160 | 
            +
                            sub_subject_map_data_columns = [
         | 
| 161 | 
            +
                                "出力_基準中科目",
         | 
| 162 | 
            +
                                "出力_中科目類似度",
         | 
| 163 | 
            +
                                "出力_中科目",
         | 
| 164 | 
            +
                                "外部・内部区分",
         | 
| 165 | 
            +
                            ]
         | 
| 166 | 
            +
             | 
| 167 | 
            +
                            sub_subject_data_mapper = BaseDictionaryMapper(
         | 
| 168 | 
            +
                                df_input_sub_subject, sub_subject_map_key_columns, sub_subject_map_data_columns
         | 
| 169 | 
            +
                            )
         | 
| 170 | 
            +
                            sub_subject_data_mapper.map_data(
         | 
| 171 | 
            +
                                df_input_data=df_output_data,
         | 
| 172 | 
            +
                                input_key_columns=sub_subject_map_key_columns,
         | 
| 173 | 
            +
                                in_place=True,
         | 
| 174 | 
             
                            )
         | 
| 175 | 
            +
                            fillna_with_space(df_output_data)
         | 
| 176 |  | 
| 177 | 
             
                    except Exception as e:
         | 
| 178 | 
             
                        print(f"Error processing SubSubjectSimilarityMapper: {e}")
         | 
| 179 | 
             
                        raise HTTPException(status_code=500, detail=str(e))
         | 
| 180 |  | 
| 181 | 
            +
                    # NameSimilarityMapper
         | 
| 182 | 
             
                    try:
         | 
|  | |
| 183 | 
             
                        if sentence_service.df_name_map_data is not None:
         | 
| 184 | 
             
                            name_sentence_mapper = NameSimilarityMapper(
         | 
| 185 | 
             
                                cached_embedding_helper=sentence_service.name_cached_embedding_helper,
         | 
| 186 | 
             
                                df_map_data=sentence_service.df_name_map_data,
         | 
| 187 | 
             
                            )
         | 
| 188 | 
            +
                            name_sentence_mapper.predict_input(df_input_data=df_output_data)
         | 
| 189 | 
            +
                            fillna_with_space(df_output_data)
         | 
| 190 |  | 
| 191 | 
             
                    except Exception as e:
         | 
| 192 | 
             
                        print(f"Error processing NameSimilarityMapper: {e}")
         | 
| 193 | 
             
                        raise HTTPException(status_code=500, detail=str(e))
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 194 |  | 
| 195 | 
            +
                    # SubSubjectAndNameDataMapper
         | 
| 196 | 
             
                    try:
         | 
|  | |
| 197 | 
             
                        if sentence_service.df_sub_subject_and_name_map_data is not None:
         | 
| 198 | 
            +
                            sub_subject_and_name_data_mapper = SubSubjectAndNameDataMapper(
         | 
| 199 | 
             
                                df_map_data=sentence_service.df_sub_subject_and_name_map_data
         | 
| 200 | 
             
                            )
         | 
| 201 | 
            +
                            sub_subject_and_name_data_mapper.map_data(df_input_data=df_output_data)
         | 
| 202 |  | 
| 203 | 
             
                    except Exception as e:
         | 
| 204 | 
             
                        print(f"Error processing SubSubjectAndNameDataMapper: {e}")
         | 
| 205 | 
             
                        raise HTTPException(status_code=500, detail=str(e))
         | 
| 206 |  | 
| 207 | 
            +
                    # UnitMapper
         | 
| 208 | 
             
                    try:
         | 
| 209 | 
            +
                        if sentence_service.df_unit_map_data is not None:
         | 
| 210 | 
            +
                            unit_similarity_mapper = UnitMapper(
         | 
| 211 | 
            +
                                cached_embedding_helper=sentence_service.unit_cached_embedding_helper,
         | 
| 212 | 
            +
                                df_map_data=sentence_service.df_unit_map_data,
         | 
| 213 | 
            +
                            )
         | 
| 214 | 
            +
                            unit_map_key_columns = ["単位"]
         | 
| 215 | 
            +
                            df_input_unit = df_input_data[unit_map_key_columns].drop_duplicates()
         | 
| 216 | 
            +
                            unit_similarity_mapper.predict_input(df_input_data=df_input_unit)
         | 
| 217 | 
            +
                            
         | 
| 218 | 
            +
                            output_unit_data_columns = ["出力_基準単位", "出力_単位類似度", "出力_集計用単位", "出力_標準単位"]
         | 
| 219 | 
            +
                            unit_data_mapper = BaseDictionaryMapper(
         | 
| 220 | 
            +
                                df_input_unit, unit_map_key_columns, output_unit_data_columns
         | 
| 221 | 
            +
                            )
         | 
| 222 | 
            +
                            _ = unit_data_mapper.map_data(
         | 
| 223 | 
            +
                                df_input_data=df_output_data, input_key_columns=unit_map_key_columns, in_place=True
         | 
| 224 | 
            +
                            )
         | 
| 225 | 
            +
                            fillna_with_space(df_output_data)
         | 
| 226 | 
            +
                    except Exception as e:
         | 
| 227 | 
            +
                        print(f"Error processing UnitMapper: {e}")
         | 
| 228 | 
            +
                        raise HTTPException(status_code=500, detail=str(e))
         | 
|  | |
|  | |
|  | |
| 229 |  | 
| 230 | 
            +
                    # AbstractSimilarityMapper
         | 
| 231 | 
            +
                    try:
         | 
| 232 | 
            +
                        if sentence_service.df_abstract_map_data is not None:
         | 
| 233 | 
             
                            abstract_similarity_mapper = AbstractSimilarityMapper(
         | 
| 234 | 
             
                                cached_embedding_helper=sentence_service.abstract_cached_embedding_helper,
         | 
| 235 | 
             
                                df_map_data=sentence_service.df_abstract_map_data,
         | 
| 236 | 
             
                            )
         | 
| 237 | 
            +
                            abstract_similarity_mapper.predict_input(df_input_data=df_output_data)
         | 
|  | |
|  | |
| 238 |  | 
| 239 | 
             
                    except Exception as e:
         | 
| 240 | 
             
                        print(f"Error processing AbstractSimilarityMapper: {e}")
         | 
| 241 | 
             
                        print(f"DEBUG: Full error traceback:")
         | 
|  | |
|  | |
| 242 | 
             
                        traceback.print_exc()
         | 
| 243 | 
             
                        # Don't raise the exception, continue processing
         | 
| 244 | 
             
                        print(f"DEBUG: Continuing without AbstractSimilarityMapper...")
         | 
| 245 |  | 
| 246 | 
            +
                    # NameAndAbstractDataMapper
         | 
| 247 | 
             
                    try:
         | 
|  | |
| 248 | 
             
                        if sentence_service.df_name_and_subject_map_data is not None:
         | 
| 249 | 
             
                            name_and_abstract_mapper = NameAndAbstractDataMapper(
         | 
| 250 | 
             
                                df_map_data=sentence_service.df_name_and_subject_map_data
         | 
| 251 | 
             
                            )
         | 
| 252 | 
            +
                            df_output_data["出力_項目名"] = df_output_data["出力_標準名称"]
         | 
| 253 | 
            +
                            _ = name_and_abstract_mapper.map_data(df_output_data)
         | 
| 254 | 
            +
                            fillna_with_space(df_output_data)
         | 
| 255 | 
            +
                            df_output_data["出力_項目名(中科目抜き)"] = df_output_data["出力_項目名"]
         | 
| 256 |  | 
| 257 | 
             
                    except Exception as e:
         | 
| 258 | 
             
                        print(f"Error processing NameAndAbstractDataMapper: {e}")
         | 
| 259 | 
             
                        raise HTTPException(status_code=500, detail=str(e))
         | 
| 260 |  | 
| 261 | 
            +
                    # SubSubjectLocationDataMapper
         | 
| 262 | 
             
                    try:
         | 
| 263 | 
            +
                        sub_subject_location_mapper = SubSubjectLocationDataMapper()
         | 
| 264 | 
            +
                        sub_subject_location_mapper.map_location(df_output_data)
         | 
| 265 | 
            +
                        df_output_data["名称"] = df_output_data["元名称"]
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 266 | 
             
                    except Exception as e:
         | 
| 267 | 
            +
                        print(f"Error processing SubSubjectLocationDataMapper: {e}")
         | 
| 268 | 
             
                        raise HTTPException(status_code=500, detail=str(e))
         | 
| 269 |  | 
| 270 | 
             
                    # Create output columns and ensure they have proper values
         | 
|  | |
| 296 | 
             
                    for col, default_value in required_columns.items():
         | 
| 297 | 
             
                        if col not in df_output_data.columns:
         | 
| 298 | 
             
                            df_output_data[col] = default_value
         | 
|  | |
| 299 | 
             
                    # Map output columns to match Excel structure
         | 
| 300 | 
             
                    # 出力_中科目 mapping - use the standard sub-subject from sub-subject mapper
         | 
| 301 | 
             
                    if "出力_中科目" in df_output_data.columns:
         | 
|  | |
| 340 | 
             
                    print(f"Available columns after processing: {list(df_output_data.columns)}")
         | 
| 341 |  | 
| 342 | 
             
                    # Final check and fallback for missing output columns
         | 
| 343 | 
            +
                    # if (
         | 
| 344 | 
            +
                    #     "出力_中科目" not in df_output_data.columns
         | 
| 345 | 
            +
                    #     or df_output_data["出力_中科目"].eq("").all()
         | 
| 346 | 
            +
                    # ):
         | 
| 347 | 
            +
                    #     df_output_data["出力_中科目"] = df_output_data.get("中科目", "")
         | 
| 348 | 
            +
             | 
| 349 | 
            +
                    # if (
         | 
| 350 | 
            +
                    #     "出力_項目名" not in df_output_data.columns
         | 
| 351 | 
            +
                    #     or df_output_data["出力_項目名"].eq("").all()
         | 
| 352 | 
            +
                    # ):
         | 
| 353 | 
            +
                    #     df_output_data["出力_項目名"] = df_output_data.get("名称", "")
         | 
| 354 | 
            +
             | 
| 355 | 
            +
                    # if (
         | 
| 356 | 
            +
                    #     "出力_単位" not in df_output_data.columns
         | 
| 357 | 
            +
                    #     or df_output_data["出力_単位"].eq("").all()
         | 
| 358 | 
            +
                    # ):
         | 
| 359 | 
            +
                    #     df_output_data["出力_単位"] = df_output_data.get("単位", "")
         | 
| 360 | 
            +
             | 
| 361 | 
            +
                    # if "出力_確率度" not in df_output_data.columns:
         | 
| 362 | 
            +
                    #     df_output_data["出力_確率度"] = 0  # Default confidence score
         | 
| 363 |  | 
| 364 | 
             
                    # Define output columns in exact order as shown in Excel
         | 
| 365 | 
             
                    output_columns = [
         | 
|  | |
| 520 | 
             
                    try:
         | 
| 521 | 
             
                        # Unit mapping
         | 
| 522 | 
             
                        if sentence_service.df_unit_map_data is not None:
         | 
| 523 | 
            +
                            unit_mapper = UnitMapper(
         | 
| 524 | 
             
                                cached_embedding_helper=sentence_service.unit_cached_embedding_helper,
         | 
| 525 | 
             
                                df_map_data=sentence_service.df_unit_map_data,
         | 
| 526 | 
             
                            )
         | 
| 527 | 
             
                            unit_mapper.predict_input(df_input_data=df_input_data)
         | 
| 528 |  | 
| 529 | 
             
                    except Exception as e:
         | 
| 530 | 
            +
                        print(f"Error processing UnitMapper: {e}")
         | 
| 531 | 
             
                        raise HTTPException(status_code=500, detail=str(e))
         | 
| 532 |  | 
| 533 | 
             
                    # Ensure required columns exist
         | 
    	
        validate_optimization.py
    CHANGED
    
    | @@ -25,7 +25,7 @@ class FileComparator: | |
| 25 | 
             
                        '出力_中科目', 
         | 
| 26 | 
             
                        '出力_標準名称', 
         | 
| 27 | 
             
                        '出力_項目名', 
         | 
| 28 | 
            -
                        '出力_ | 
| 29 | 
             
                    ]
         | 
| 30 |  | 
| 31 | 
             
                def load_original_data(self) -> pd.DataFrame:
         | 
| @@ -236,7 +236,7 @@ def main(): | |
| 236 | 
             
                """Main function to compare two files"""
         | 
| 237 | 
             
                # File paths
         | 
| 238 | 
             
                original_file = "data/outputData_original.csv"
         | 
| 239 | 
            -
                second_file = "data/ | 
| 240 |  | 
| 241 | 
             
                if not os.path.exists(original_file):
         | 
| 242 | 
             
                    print(f"❌ Original file not found: {original_file}")
         | 
|  | |
| 25 | 
             
                        '出力_中科目', 
         | 
| 26 | 
             
                        '出力_標準名称', 
         | 
| 27 | 
             
                        '出力_項目名', 
         | 
| 28 | 
            +
                        '出力_集計用単位'
         | 
| 29 | 
             
                    ]
         | 
| 30 |  | 
| 31 | 
             
                def load_original_data(self) -> pd.DataFrame:
         | 
|  | |
| 236 | 
             
                """Main function to compare two files"""
         | 
| 237 | 
             
                # File paths
         | 
| 238 | 
             
                original_file = "data/outputData_original.csv"
         | 
| 239 | 
            +
                second_file = "data/outputData_api.csv"
         | 
| 240 |  | 
| 241 | 
             
                if not os.path.exists(original_file):
         | 
| 242 | 
             
                    print(f"❌ Original file not found: {original_file}")
         | 
