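"""FastAPI router for the meisai-check-ai mapping pipeline.

Exposes endpoints that standardize Japanese line-item fields
(科目 / 中科目 / 名称 / 単位 / 摘要) by chaining similarity mappers over
cached sentence-transformer embeddings.
"""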
import os
import time
import shutil
import pandas as pd
import traceback
import sys
from pathlib import Path

from fastapi import APIRouter, UploadFile, File, HTTPException, Depends, Body
from fastapi.responses import FileResponse

from custom_auth import get_current_user_from_token
from services.sentence_transformer_service import (
    SentenceTransformerService,
    sentence_transformer_service,
)

# Add the path to import modules from meisai-check-ai
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "meisai-check-ai"))

from mapping_lib.standard_subject_data_mapper import StandardSubjectDataMapper
from mapping_lib.subject_similarity_mapper import SubjectSimilarityMapper
from mapping_lib.sub_subject_similarity_mapper import SubSubjectSimilarityMapper
from mapping_lib.name_similarity_mapper import NameSimilarityMapper
from mapping_lib.sub_subject_and_name_data_mapper import SubSubjectAndNameDataMapper
from mapping_lib.sub_subject_location_data_mapper import SubSubjectLocationDataMapper
from mapping_lib.abstract_similarity_mapper import AbstractSimilarityMapper
from mapping_lib.name_and_abstract_mapper import NameAndAbstractDataMapper
from mapping_lib.unit_similarity_mapper import UnitSimilarityMapper
from mapping_lib.standard_name_mapper import StandardNameMapper

from config import UPLOAD_DIR, OUTPUT_DIR
from models import (
    EmbeddingRequest,
    PredictRawRequest,
    PredictRawResponse,
    PredictRecord,
    PredictResult,
)

router = APIRouter()
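

# NOTE: route decorator reconstructed so the module is runnable; the original
# decorator is not shown in this excerpt, so the "/predict" path is an assumption.
@router.post("/predict")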
async def predict(
    current_user=Depends(get_current_user_from_token),
    file: UploadFile = File(...),
    sentence_service: SentenceTransformerService = Depends(
        lambda: sentence_transformer_service
    ),
):
    """
    Process an input CSV file and return standardized names (requires authentication).
    """
    if not file.filename.endswith(".csv"):
        raise HTTPException(status_code=400, detail="Only CSV files are supported")

    # Save uploaded file
    timestamp = int(time.time())
    input_file_path = os.path.join(
        UPLOAD_DIR, f"input_{timestamp}_{current_user.username}.csv"
    )
    output_file_path = os.path.join(
        OUTPUT_DIR, f"output_{timestamp}_{current_user.username}.csv"
    )
    try:
        with open(input_file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
    finally:
        file.file.close()
    try:
        # Load input data
        start_time = time.time()
        df_input_data = pd.read_csv(input_file_path)

        # Ensure basic columns exist with default values
        basic_columns = {
            "シート名": "",
            "行": "",
            "科目": "",
            "中科目": "",
            "分類": "",
            "名称": "",
            "単位": "",
            "摘要": "",
            "備考": "",
        }
        for col, default_value in basic_columns.items():
            if col not in df_input_data.columns:
                df_input_data[col] = default_value
        # Process data using the new mapping system similar to predict.py
        try:
            # Subject mapping
            if sentence_service.df_subject_map_data is not None:
                subject_similarity_mapper = SubjectSimilarityMapper(
                    cached_embedding_helper=sentence_service.subject_cached_embedding_helper,
                    df_map_data=sentence_service.df_subject_map_data,
                )
                list_input_subject = df_input_data["科目"].unique()
                df_subject_data = pd.DataFrame({"科目": list_input_subject})
                subject_similarity_mapper.predict_input_optimized(
                    df_input_data=df_subject_data
                )
                output_subject_map = dict(
                    zip(df_subject_data["科目"], df_subject_data["出力_科目"])
                )
                df_input_data["標準科目"] = df_input_data["科目"].map(output_subject_map)
                df_input_data["出力_科目"] = df_input_data["科目"].map(output_subject_map)
        except Exception as e:
            print(f"Error processing SubjectSimilarityMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))
        try:
            # Standard subject mapping
            if sentence_service.df_standard_subject_map_data is not None:
                standard_subject_data_mapper = StandardSubjectDataMapper(
                    df_map_data=sentence_service.df_standard_subject_map_data
                )
                df_output_data = standard_subject_data_mapper.map_data(
                    df_input_data=df_input_data,
                    input_key_columns=["出力_科目"],
                    in_place=True,
                )
            else:
                df_output_data = df_input_data.copy()
        except Exception as e:
            print(f"Error processing StandardSubjectDataMapper: {e}")
            # Continue with original data if standard subject mapping fails
            df_output_data = df_input_data.copy()
        try:
            # Sub subject mapping
            if sentence_service.df_sub_subject_map_data is not None:
                sub_subject_similarity_mapper = SubSubjectSimilarityMapper(
                    cached_embedding_helper=sentence_service.sub_subject_cached_embedding_helper,
                    df_map_data=sentence_service.df_sub_subject_map_data,
                )
                sub_subject_similarity_mapper.predict_input_optimized(
                    df_input_data=df_output_data
                )
                df_output_data = df_output_data.fillna("")
        except Exception as e:
            print(f"Error processing SubSubjectSimilarityMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))
        try:
            # Name mapping
            if sentence_service.df_name_map_data is not None:
                name_sentence_mapper = NameSimilarityMapper(
                    cached_embedding_helper=sentence_service.name_cached_embedding_helper,
                    df_map_data=sentence_service.df_name_map_data,
                )
                name_sentence_mapper.predict_input_optimized(df_input_data=df_output_data)
        except Exception as e:
            print(f"Error processing NameSimilarityMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))
        try:
            # Sub subject location mapping
            sub_subject_location_mapper = SubSubjectLocationDataMapper()
            sub_subject_location_mapper.map_location(df_output_data)
        except Exception as e:
            print(f"Error processing SubSubjectLocationDataMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))
        try:
            # Sub subject and name mapping
            if sentence_service.df_sub_subject_and_name_map_data is not None:
                sub_subject_and_name_mapper = SubSubjectAndNameDataMapper(
                    df_map_data=sentence_service.df_sub_subject_and_name_map_data
                )
                sub_subject_and_name_mapper.map_data(df_input_data=df_output_data)
        except Exception as e:
            print(f"Error processing SubSubjectAndNameDataMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))
        try:
            # Abstract mapping
            if sentence_service.df_abstract_map_data is not None:
                # Ensure required columns exist before AbstractSimilarityMapper
                required_columns_for_abstract = {
                    "標準科目": "",
                    "摘要グループ": "",
                    "確定": "未確定",
                    "摘要": "",
                    "備考": "",
                }
                # Add missing columns with appropriate defaults
                for col, default_val in required_columns_for_abstract.items():
                    if col not in df_output_data.columns:
                        df_output_data[col] = default_val
                        print(
                            f"DEBUG: Added missing column '{col}' with default value '{default_val}'"
                        )
                # Ensure data types are correct; fill NaN before casting so
                # missing values become "" rather than the string "nan"
                for col in ["標準科目", "摘要グループ", "確定", "摘要", "備考"]:
                    if col in df_output_data.columns:
                        df_output_data[col] = df_output_data[col].fillna("").astype(str)
                abstract_similarity_mapper = AbstractSimilarityMapper(
                    cached_embedding_helper=sentence_service.abstract_cached_embedding_helper,
                    df_map_data=sentence_service.df_abstract_map_data,
                )
                abstract_similarity_mapper.predict_input_optimized(
                    df_input_data=df_output_data
                )
                print("DEBUG: AbstractSimilarityMapper completed successfully")
        except Exception as e:
            print(f"Error processing AbstractSimilarityMapper: {e}")
            print("DEBUG: Full error traceback:")
            traceback.print_exc()  # traceback is already imported at module level
            # Abstract mapping is optional: log and continue instead of raising
            print("DEBUG: Continuing without AbstractSimilarityMapper...")
        try:
            # Name and abstract mapping
            if sentence_service.df_name_and_subject_map_data is not None:
                name_and_abstract_mapper = NameAndAbstractDataMapper(
                    df_map_data=sentence_service.df_name_and_subject_map_data
                )
                df_output_data = name_and_abstract_mapper.map_data(df_output_data)
        except Exception as e:
            print(f"Error processing NameAndAbstractDataMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))
        try:
            # Unit mapping
            if sentence_service.df_unit_map_data is not None:
                unit_mapper = UnitSimilarityMapper(
                    cached_embedding_helper=sentence_service.unit_cached_embedding_helper,
                    df_map_data=sentence_service.df_unit_map_data,
                )
                unit_mapper.predict_input_optimized(df_input_data=df_output_data)
        except Exception as e:
            print(f"Error processing UnitSimilarityMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))
        try:
            # Standard name mapping
            if sentence_service.df_standard_name_map_data is not None:
                standard_name_mapper = StandardNameMapper(
                    df_map_data=sentence_service.df_standard_name_map_data
                )
                df_output_data = standard_name_mapper.map_data(df_output_data)
        except Exception as e:
            print(f"Error processing StandardNameMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))
        # Create output columns and ensure they have proper values
        # Add ID column if not exists
        if "ID" not in df_output_data.columns:
            df_output_data.reset_index(drop=False, inplace=True)
            df_output_data.rename(columns={"index": "ID"}, inplace=True)
            df_output_data["ID"] = df_output_data["ID"] + 1  # Start from 1

        # Ensure required columns exist with default values
        required_columns = {
            "シート名": "",
            "行": "",
            "科目": "",
            "中科目": "",
            "分類": "",
            "名称": "",
            "単位": "",
            "摘要": "",
            "備考": "",
            "出力_科目": "",
            "出力_中科目": "",
            "出力_項目名": "",
            "出力_標準単位": "",
            "出力_集計用単位": "",
            "出力_確率度": 0.0,
        }
        for col, default_value in required_columns.items():
            if col not in df_output_data.columns:
                df_output_data[col] = default_value
        # Map output columns to match the Excel structure. The required-columns
        # guard above guarantees each 出力_ column exists, so fall back to the
        # mapper-specific source columns only when a column is still empty.

        # 出力_中科目 mapping - use the standard sub-subject from sub-subject mapper
        if df_output_data["出力_中科目"].eq("").all():
            if "出力_基準中科目" in df_output_data.columns:
                df_output_data["出力_中科目"] = df_output_data["出力_基準中科目"]
            elif "標準中科目" in df_output_data.columns:
                df_output_data["出力_中科目"] = df_output_data["標準中科目"]

        # 出力_項目名 mapping - use the final item name from name and abstract mapper
        if df_output_data["出力_項目名"].eq("").all():
            if "出力_標準名称" in df_output_data.columns:
                df_output_data["出力_項目名"] = df_output_data["出力_標準名称"]
            elif "出力_基準名称" in df_output_data.columns:
                df_output_data["出力_項目名"] = df_output_data["出力_基準名称"]

        # 出力_標準単位 and 出力_集計用単位 are written directly by the unit
        # mapper, so no re-mapping is needed here.

        # 出力_確率度 mapping - use the name similarity as the main probability,
        # falling back to the other similarity scores when it is missing
        if "出力_名称類似度" in df_output_data.columns:
            df_output_data["出力_確率度"] = df_output_data["出力_名称類似度"]
        elif "出力_中科目類似度" in df_output_data.columns:
            df_output_data["出力_確率度"] = df_output_data["出力_中科目類似度"]
        elif "出力_摘要類似度" in df_output_data.columns:
            df_output_data["出力_確率度"] = df_output_data["出力_摘要類似度"]
        elif "出力_単位類似度" in df_output_data.columns:
            df_output_data["出力_確率度"] = df_output_data["出力_単位類似度"]
        else:
            df_output_data["出力_確率度"] = 0.0
        # Fill NaN values and ensure all output columns have proper values
        df_output_data = df_output_data.fillna("")

        # Debug: Print available columns to see what we have
        print(f"Available columns after processing: {list(df_output_data.columns)}")

        # Final check and fallback for missing output columns
        if (
            "出力_中科目" not in df_output_data.columns
            or df_output_data["出力_中科目"].eq("").all()
        ):
            df_output_data["出力_中科目"] = df_output_data.get("中科目", "")
        if (
            "出力_項目名" not in df_output_data.columns
            or df_output_data["出力_項目名"].eq("").all()
        ):
            df_output_data["出力_項目名"] = df_output_data.get("名称", "")
        if (
            "出力_単位" not in df_output_data.columns
            or df_output_data["出力_単位"].eq("").all()
        ):
            df_output_data["出力_単位"] = df_output_data.get("単位", "")
        if "出力_確率度" not in df_output_data.columns:
            df_output_data["出力_確率度"] = 0  # Default confidence score
        # Define output columns in exact order as shown in Excel
        output_columns = [
            "ID",
            "シート名",
            "行",
            "科目",
            "中科目",
            "分類",
            "名称",
            "単位",
            "摘要",
            "備考",
            "出力_科目",
            "出力_中科目",
            "出力_項目名",
            "出力_確率度",
            "出力_標準単位",
            "出力_集計用単位",
        ]

        # Save with utf_8_sig encoding for Japanese Excel compatibility
        df_output_data[output_columns].to_csv(
            output_file_path, index=False, encoding="utf_8_sig"
        )

        # Save all caches
        sentence_service.save_all_caches()

        end_time = time.time()
        execution_time = end_time - start_time
        print(f"Execution time: {execution_time} seconds")
        # media_type already sets Content-Type: text/csv for the download
        return FileResponse(
            path=output_file_path,
            filename=f"output_{Path(file.filename).stem}.csv",
            media_type="text/csv",
            headers={
                "Content-Disposition": f'attachment; filename="output_{Path(file.filename).stem}.csv"'
            },
        )
    except Exception as e:
        print(f"Error processing file: {e}")
        raise HTTPException(status_code=500, detail=str(e))
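

# NOTE: route decorator reconstructed so the module is runnable; the original
# decorator is not shown in this excerpt, so the "/embeddings" path is an assumption.
@router.post("/embeddings")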
async def create_embeddings(
    request: EmbeddingRequest,
    current_user=Depends(get_current_user_from_token),
    sentence_service: SentenceTransformerService = Depends(
        lambda: sentence_transformer_service
    ),
):
    """
    Create embeddings for a list of input sentences (requires authentication).
    """
    try:
        start_time = time.time()
        embeddings = sentence_service.sentenceTransformerHelper.create_embeddings(
            request.sentences
        )
        end_time = time.time()
        execution_time = end_time - start_time
        print(f"Execution time: {execution_time} seconds")

        # Convert numpy array to list for JSON serialization
        embeddings_list = embeddings.tolist()
        return {"embeddings": embeddings_list}
    except Exception as e:
        print(f"Error creating embeddings: {e}")
        raise HTTPException(status_code=500, detail=str(e))
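

# NOTE: route decorator reconstructed so the module is runnable; the original
# decorator is not shown in this excerpt, so the "/predict-raw" path is an assumption.
@router.post("/predict-raw")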
async def predict_raw(
    request: PredictRawRequest,
    current_user=Depends(get_current_user_from_token),
    sentence_service: SentenceTransformerService = Depends(
        lambda: sentence_transformer_service
    ),
):
    """
    Process raw input records and return standardized names (requires authentication).
    """
    try:
        # Convert input records to DataFrame
        records_dict = {
            "科目": [],
            "中科目": [],
            "分類": [],
            "名称": [],
            "単位": [],
            "摘要": [],
            "備考": [],
            "シート名": [],  # Required by BaseNameData but not used
            "行": [],  # Required by BaseNameData but not used
        }
        for record in request.records:
            records_dict["科目"].append(record.subject)
            records_dict["中科目"].append(record.sub_subject)
            records_dict["分類"].append(record.name_category)
            records_dict["名称"].append(record.name)
            records_dict["単位"].append("")  # Default empty
            records_dict["摘要"].append(record.abstract or "")
            records_dict["備考"].append(record.memo or "")
            records_dict["シート名"].append("")  # Placeholder
            records_dict["行"].append("")  # Placeholder
        df_input_data = pd.DataFrame(records_dict)
        # Process data similar to the main predict function
        try:
            # Subject mapping
            if sentence_service.df_subject_map_data is not None:
                subject_similarity_mapper = SubjectSimilarityMapper(
                    cached_embedding_helper=sentence_service.subject_cached_embedding_helper,
                    df_map_data=sentence_service.df_subject_map_data,
                )
                list_input_subject = df_input_data["科目"].unique()
                df_subject_data = pd.DataFrame({"科目": list_input_subject})
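                # Note: unlike the CSV endpoint above, this endpoint calls the
                # non-optimized predict_input() variant of each mapper.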
                subject_similarity_mapper.predict_input(df_input_data=df_subject_data)
                output_subject_map = dict(
                    zip(df_subject_data["科目"], df_subject_data["出力_科目"])
                )
                df_input_data["標準科目"] = df_input_data["科目"].map(output_subject_map)
                df_input_data["出力_科目"] = df_input_data["科目"].map(output_subject_map)
            else:
                df_input_data["標準科目"] = df_input_data["科目"]
                df_input_data["出力_科目"] = df_input_data["科目"]
        except Exception as e:
            print(f"Error processing SubjectSimilarityMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))
        try:
            # Name mapping (simplified for raw predict)
            if sentence_service.df_name_map_data is not None:
                name_sentence_mapper = NameSimilarityMapper(
                    cached_embedding_helper=sentence_service.name_cached_embedding_helper,
                    df_map_data=sentence_service.df_name_map_data,
                )
                name_sentence_mapper.predict_input(df_input_data=df_input_data)
        except Exception as e:
            print(f"Error processing NameSimilarityMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))

        try:
            # Unit mapping
            if sentence_service.df_unit_map_data is not None:
                unit_mapper = UnitSimilarityMapper(
                    cached_embedding_helper=sentence_service.unit_cached_embedding_helper,
                    df_map_data=sentence_service.df_unit_map_data,
                )
                unit_mapper.predict_input(df_input_data=df_input_data)
        except Exception as e:
            print(f"Error processing UnitSimilarityMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))
        # Ensure required columns exist
        for col in [
            "確定",
            "出力_標準名称",
            "出力_名称類似度",
            "出力_標準単位",
            "出力_単位類似度",
        ]:
            if col not in df_input_data.columns:
                if col in ["出力_名称類似度", "出力_単位類似度"]:
                    df_input_data[col] = 0.0
                else:
                    df_input_data[col] = ""
        # Convert results to response format
        results = []
        for _, row in df_input_data.iterrows():
            result = PredictResult(
                subject=row["科目"],
                sub_subject=row["中科目"],
                name_category=row["分類"],
                name=row["名称"],
                abstract=row["摘要"],
                memo=row["備考"],
                confirmed=row.get("確定", ""),
                standard_subject=row.get("出力_科目", row["科目"]),
                standard_name=row.get("出力_標準名称", ""),
                similarity_score=float(row.get("出力_名称類似度", 0.0)),
            )
            results.append(result)

        # Save all caches
        sentence_service.save_all_caches()

        return PredictRawResponse(results=results)
    except Exception as e:
        print(f"Error processing records: {e}")
        raise HTTPException(status_code=500, detail=str(e))
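

# Usage sketch (assumptions: the router is mounted at the application root,
# the route paths assumed above are correct, and a bearer token is obtained
# from the custom_auth flow):
#
#   curl -X POST http://localhost:8000/predict \
#        -H "Authorization: Bearer <token>" \
#        -F "file=@input.csv" \
#        -o output.csv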