vumichien committed
Commit b77c0a2 · 1 Parent(s): dc7fc97

update project structure

auth.py ADDED
@@ -0,0 +1,67 @@
+ from datetime import datetime, timedelta, timezone
+ import jwt
+ from fastapi import Depends, HTTPException, status
+ from fastapi.security import OAuth2PasswordBearer
+ from passlib.context import CryptContext
+ from config import SECRET_KEY, ALGORITHM, ACCESS_TOKEN_EXPIRE_HOURS
+ from models import TokenData, UserInDB, User
+ from database import users_db
+ from typing import Annotated, Optional
+ from jwt.exceptions import InvalidTokenError
+
+ pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")
+
+ # Authentication helper functions
+ def verify_password(plain_password, hashed_password):
+     return pwd_context.verify(plain_password, hashed_password)
+
+ def get_user(db, username: str):
+     if username in db:
+         user_dict = db[username]
+         return UserInDB(**user_dict)
+     return None
+
+ def authenticate_user(fake_db, username: str, password: str):
+     user = get_user(fake_db, username)
+     if not user:
+         return False
+     if not verify_password(password, user.hashed_password):
+         return False
+     return user
+
+ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
+     to_encode = data.copy()
+     if expires_delta:
+         expire = datetime.now(timezone.utc) + expires_delta
+     else:
+         expire = datetime.now(timezone.utc) + timedelta(hours=ACCESS_TOKEN_EXPIRE_HOURS)
+     to_encode.update({"exp": expire})
+     encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
+     return encoded_jwt
+
+ async def get_current_user(token: Annotated[str, Depends(oauth2_scheme)]):
+     credentials_exception = HTTPException(
+         status_code=status.HTTP_401_UNAUTHORIZED,
+         detail="Could not validate credentials",
+         headers={"WWW-Authenticate": "Bearer"},
+     )
+     try:
+         payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
+         username = payload.get("sub")
+         if username is None:
+             raise credentials_exception
+         token_data = TokenData(username=username)
+     except InvalidTokenError:
+         raise credentials_exception
+     user = get_user(users_db, username=token_data.username)
+     if user is None:
+         raise credentials_exception
+     return user
+
+ async def get_current_active_user(
+     current_user: Annotated[User, Depends(get_current_user)],
+ ):
+     if current_user.disabled:
+         raise HTTPException(status_code=400, detail="Inactive user")
+     return current_user
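A quick sanity-check sketch of how these helpers fit together (assuming this repository's auth.py and config.py are importable; the username is one of the entries in database.py):

# Issue a token and decode it back (PyJWT round trip)
from datetime import timedelta
import jwt
from auth import create_access_token
from config import SECRET_KEY, ALGORITHM

token = create_access_token(data={"sub": "chien_vm"}, expires_delta=timedelta(hours=1))
payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
assert payload["sub"] == "chien_vm"  # the subject claim survives the round trip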
config.py ADDED
@@ -0,0 +1,19 @@
+ import os
+
+ # Security Config
+ SECRET_KEY = "09d25e094faa6ca2556c818166b7a9563b93f7099f6f0f4caa6cf63b88e8d3e7"
+ ALGORITHM = "HS256"
+ ACCESS_TOKEN_EXPIRE_HOURS = 24
+
+ # Paths
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+ DATA_DIR = os.path.join(BASE_DIR, "data")
+ UPLOAD_DIR = os.path.join(BASE_DIR, "uploads")
+ OUTPUT_DIR = os.path.join(BASE_DIR, "outputs")
+ SUBJECT_DATA_FILE = os.path.join(DATA_DIR, "subjectData.csv")
+ SAMPLE_DATA_FILE = os.path.join(DATA_DIR, "sampleData.csv")
+ # Model and embedding files
+ MODEL_NAME = "Detomo/cl-nagoya-sup-simcse-ja-for-standard-name-v1_0"
+ SENTENCE_EMBEDDING_FILE = os.path.join(DATA_DIR, "sample_name_sentence_embeddings(cl-nagoya-sup-simcse-ja-for-standard-name-v1_1).pkl")
+ SENTENCE_SIMILARITY_FILE = os.path.join(DATA_DIR, "sample_name_sentence_similarities(cl-nagoya-sup-simcse-ja-for-standard-name-v1_1).pkl")
+
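Note that SECRET_KEY is committed in plain text. A minimal sketch of the usual alternative (reading it from an environment variable; the variable name and fallback value are assumptions, not part of this commit):

import os

# Prefer an injected secret; fall back to a development-only default
SECRET_KEY = os.environ.get("SECRET_KEY", "dev-only-insecure-key")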
database.py ADDED
@@ -0,0 +1,16 @@
+ users_db = {
+     "chien_vm": {
+         "username": "chien_vm",
+         "full_name": "Chien VM",
+         "email": "[email protected]",
+         "hashed_password": "$2b$12$RtcKFk7B3hKd7vYkwxdFN.eBXSiryQIRUG.OoJ07Pl9lzHNUkugMi",
+         "disabled": False,
+     },
+     "hoi_nv": {
+         "username": "hoi_nv",
+         "full_name": "Hoi NV",
+         "email": "[email protected]",
+         "hashed_password": "$2b$12$RtcKFk7B3hKd7vYkwxdFN.eBXSiryQIRUG.OoJ07Pl9lzHNUkugMi",
+         "disabled": False,
+     }
+ }
main.py CHANGED
@@ -1,318 +1,55 @@
import sys
import os
- import time
- from fastapi import FastAPI, UploadFile, File, HTTPException, Depends, status
- from fastapi.responses import FileResponse
- from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
+ from fastapi import FastAPI
import uvicorn
import traceback
- import pickle
- import shutil
- from pathlib import Path
from contextlib import asynccontextmanager
- import pandas as pd
- from typing import Annotated, Optional, Union
- from datetime import datetime, timedelta, timezone
- import jwt
- from jwt.exceptions import InvalidTokenError
- from passlib.context import CryptContext
- from pydantic import BaseModel

current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(current_dir, "meisai-check-ai"))

- from sentence_transformer_lib.sentence_transformer_helper import (
-     SentenceTransformerHelper,
- )
- from data_lib.input_name_data import InputNameData
- from data_lib.subject_data import SubjectData
- from data_lib.sample_name_data import SampleNameData
- from clustering_lib.sentence_clustering_lib import SentenceClusteringLib
- from data_lib.base_data import (
-     COL_STANDARD_NAME,
-     COL_STANDARD_NAME_KEY,
-     COL_STANDARD_SUBJECT,
- )
- from mapping_lib.name_mapping_helper import NameMappingHelper
-
- # Initialize global variables for model and data
- sentenceTransformerHelper = None
- dic_standard_subject = None
- sample_name_sentence_embeddings = None
- sample_name_sentence_similarities = None
- sampleData = None
- sentence_clustering_lib = None
- name_groups = None
-
- # Create data directory if it doesn't exist
- os.makedirs(os.path.join(current_dir, "data"), exist_ok=True)
- os.makedirs(os.path.join(current_dir, "uploads"), exist_ok=True)
- os.makedirs(os.path.join(current_dir, "outputs"), exist_ok=True)
-
- # Authentication related settings
- SECRET_KEY = "09d25e094faa6ca2556c818166b7a9563b93f7099f6f0f4caa6cf63b88e8d3e7"
- ALGORITHM = "HS256"
- ACCESS_TOKEN_EXPIRE_HOURS = 24  # Token expiration set to 24 hours
-
- # Password hashing context
- pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
-
- # OAuth2 scheme for token
- oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")
-
- # User database models
- class Token(BaseModel):
-     access_token: str
-     token_type: str
-
- class TokenData(BaseModel):
-     username: Optional[str] = None
-
- class User(BaseModel):
-     username: str
-     email: Optional[str] = None
-     full_name: Optional[str] = None
-     disabled: Optional[bool] = None
-
- class UserInDB(User):
-     hashed_password: str
-
- # Fake users database with hashed passwords
- users_db = {
-     "chien_vm": {
-         "username": "chien_vm",
-         "full_name": "Chien VM",
-         "email": "[email protected]",
-         "hashed_password": "$2b$12$RtcKFk7B3hKd7vYkwxdFN.eBXSiryQIRUG.OoJ07Pl9lzHNUkugMi",
-         "disabled": False,
-     },
-     "hoi_nv": {
-         "username": "hoi_nv",
-         "full_name": "Hoi NV",
-         "email": "[email protected]",
-         "hashed_password": "$2b$12$RtcKFk7B3hKd7vYkwxdFN.eBXSiryQIRUG.OoJ07Pl9lzHNUkugMi",
-         "disabled": False,
-     }
- }
+ from routes import auth, predict, health
+ from services.sentence_transformer_service import sentence_transformer_service
+ from utils import create_directories

- # Authentication helper functions
- def verify_password(plain_password, hashed_password):
-     return pwd_context.verify(plain_password, hashed_password)
-
- def get_user(db, username: str):
-     if username in db:
-         user_dict = db[username]
-         return UserInDB(**user_dict)
-     return None
-
- def authenticate_user(fake_db, username: str, password: str):
-     user = get_user(fake_db, username)
-     if not user:
-         return False
-     if not verify_password(password, user.hashed_password):
-         return False
-     return user
-
- def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
-     to_encode = data.copy()
-     if expires_delta:
-         expire = datetime.now(timezone.utc) + expires_delta
-     else:
-         expire = datetime.now(timezone.utc) + timedelta(hours=ACCESS_TOKEN_EXPIRE_HOURS)
-     to_encode.update({"exp": expire})
-     encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
-     return encoded_jwt
-
- async def get_current_user(token: Annotated[str, Depends(oauth2_scheme)]):
-     credentials_exception = HTTPException(
-         status_code=status.HTTP_401_UNAUTHORIZED,
-         detail="Could not validate credentials",
-         headers={"WWW-Authenticate": "Bearer"},
-     )
-     try:
-         payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
-         username = payload.get("sub")
-         if username is None:
-             raise credentials_exception
-         token_data = TokenData(username=username)
-     except InvalidTokenError:
-         raise credentials_exception
-     user = get_user(users_db, username=token_data.username)
-     if user is None:
-         raise credentials_exception
-     return user
-
- async def get_current_active_user(
-     current_user: Annotated[User, Depends(get_current_user)],
- ):
-     if current_user.disabled:
-         raise HTTPException(status_code=400, detail="Inactive user")
-     return current_user

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context manager for startup and shutdown events"""
-     global sentenceTransformerHelper, dic_standard_subject, sample_name_sentence_embeddings
-     global sample_name_sentence_similarities, sampleData, sentence_clustering_lib, name_groups
-
    try:
-         # Load sentence transformer model
-         sentenceTransformerHelper = SentenceTransformerHelper(
-             convert_to_zenkaku_flag=True, replace_words=None, keywords=None
-         )
-         sentenceTransformerHelper.load_model_by_name(
-             "Detomo/cl-nagoya-sup-simcse-ja-for-standard-name-v1_0"
-         )
-
-         # Load standard subject dictionary
-         dic_standard_subject = SubjectData.create_standard_subject_dic_from_file(
-             "data/subjectData.csv"
-         )
-
-         # Load pre-computed embeddings and similarities
-         with open(
-             f"data/sample_name_sentence_embeddings(cl-nagoya-sup-simcse-ja-for-standard-name-v1_1).pkl",
-             "rb",
-         ) as f:
-             sample_name_sentence_embeddings = pickle.load(f)
-
-         with open(
-             f"data/sample_name_sentence_similarities(cl-nagoya-sup-simcse-ja-for-standard-name-v1_1).pkl",
-             "rb",
-         ) as f:
-             sample_name_sentence_similarities = pickle.load(f)
-
-         # Load and process sample data
-         sampleData = SampleNameData()
-         file_path = os.path.join(current_dir, "data", "sampleData.csv")
-         sampleData.load_data_from_csv(file_path)
-         sampleData.process_data()
-
-         # Create sentence clusters
-         sentence_clustering_lib = SentenceClusteringLib(sample_name_sentence_embeddings)
-         best_name_eps = 0.07
-         name_groups, _ = sentence_clustering_lib.create_sentence_cluster(best_name_eps)
-         sampleData._create_key_column(
-             COL_STANDARD_NAME_KEY, COL_STANDARD_SUBJECT, COL_STANDARD_NAME
-         )
-         sampleData.set_name_sentence_labels(name_groups)
-         sampleData.build_search_tree()
-
-         print("Models and data loaded successfully")
+         # Load models and data ONCE at startup
+         sentence_transformer_service.load_model_data()
    except Exception as e:
        print(f"Error during startup: {e}")
        traceback.print_exc()

-     yield  # This is where the app runs
+     yield  # The app runs here

-     # Cleanup code (if needed) goes here
    print("Shutting down application")

-
- app = FastAPI(lifespan=lifespan)
+ # Initialize FastAPI
+ app = FastAPI(
+     title="MeisaiCheck API",
+     description="API for MeisaiCheck AI System",
+     version="1.0",
+     lifespan=lifespan,
+     openapi_tags=[
+         {"name": "Health", "description": "Health check endpoints"},
+         {"name": "Authentication", "description": "User authentication and token management"},
+         {"name": "Prediction", "description": "Predict and process CSV files"},
+     ]
+ )
+
+ # Include Routers
+ app.include_router(health.router, tags=["Health"])
+ app.include_router(auth.router, tags=["Authentication"])
+ app.include_router(predict.router, tags=["Prediction"])


- @app.get("/")
+ @app.get("/", tags=["Health"])
async def root():
    return {"message": "Hello World"}

-
- @app.get("/health")
- async def health_check():
-     return {"status": "ok", "timestamp": time.time()}
-
-
- @app.post("/token")
- async def login_for_access_token(
-     form_data: Annotated[OAuth2PasswordRequestForm, Depends()]
- ) -> Token:
-     """
-     Login endpoint to get an access token
-     """
-     user = authenticate_user(users_db, form_data.username, form_data.password)
-     if not user:
-         raise HTTPException(
-             status_code=status.HTTP_401_UNAUTHORIZED,
-             detail="Incorrect username or password",
-             headers={"WWW-Authenticate": "Bearer"},
-         )
-     access_token_expires = timedelta(hours=ACCESS_TOKEN_EXPIRE_HOURS)
-     access_token = create_access_token(
-         data={"sub": user.username}, expires_delta=access_token_expires
-     )
-     return Token(access_token=access_token, token_type="bearer")
-
-
- @app.post("/predict")
- async def predict(
-     current_user: Annotated[User, Depends(get_current_active_user)],
-     file: UploadFile = File(...)
- ):
-     """
-     Process an input CSV file and return standardized names (requires authentication)
-     """
-     global sentenceTransformerHelper, dic_standard_subject, sample_name_sentence_embeddings
-     global sample_name_sentence_similarities, sampleData, name_groups
-
-     if not file.filename.endswith(".csv"):
-         raise HTTPException(status_code=400, detail="Only CSV files are supported")
-
-     # Save uploaded file
-     timestamp = int(time.time())
-     input_file_path = os.path.join(current_dir, "uploads", f"input_{timestamp}_{current_user.username}.csv")
-
-     # Use CSV format with correct extension
-     output_file_path = os.path.join(current_dir, "outputs", f"output_{timestamp}_{current_user.username}.csv")
-
-     try:
-         with open(input_file_path, "wb") as buffer:
-             shutil.copyfileobj(file.file, buffer)
-     finally:
-         file.file.close()
-
-     try:
-         # Process input data
-         inputData = InputNameData(dic_standard_subject)
-         inputData.load_data_from_csv(input_file_path)
-         inputData.process_data()
-
-         # Map standard names
-         nameMappingHelper = NameMappingHelper(
-             sentenceTransformerHelper,
-             inputData,
-             sampleData,
-             sample_name_sentence_embeddings,
-             sample_name_sentence_similarities,
-         )
-         df_predicted = nameMappingHelper.map_standard_names()
-         # Create output dataframe and save to CSV
-         print("Columns of inputData.dataframe", inputData.dataframe.columns)
-         column_to_keep = ['シート名', '行', '科目', '分類', '名称', '摘要', '備考']
-         output_df = inputData.dataframe[column_to_keep].copy()
-         output_df.reset_index(drop=False, inplace=True)
-         output_df.loc[:, "出力_科目"] = df_predicted["出力_科目"]
-         output_df.loc[:, "出力_項目名"] = df_predicted["出力_項目名"]
-         output_df.loc[:, "出力_確率度"] = df_predicted["出力_確率度"]
-
-         # Save with utf_8_sig encoding for Japanese Excel compatibility
-         output_df.to_csv(output_file_path, index=False, encoding="utf_8_sig")
-
-         # Return the file as a download with correct content type and headers
-         return FileResponse(
-             path=output_file_path,
-             filename=f"output_{Path(file.filename).stem}.csv",
-             media_type="text/csv",
-             headers={
-                 "Content-Disposition": f'attachment; filename="output_{Path(file.filename).stem}.csv"',
-                 "Content-Type": "application/x-www-form-urlencoded",
-             },
-         )
-
-     except Exception as e:
-         print(f"Error processing file: {e}")
-         traceback.print_exc()
-         raise HTTPException(status_code=500, detail=str(e))
-
-
if __name__ == "__main__":
+     create_directories()
    uvicorn.run(app, host="0.0.0.0", port=8000)
models.py ADDED
@@ -0,0 +1,18 @@
+ from pydantic import BaseModel
+ from typing import Optional
+
+ class Token(BaseModel):
+     access_token: str
+     token_type: str
+
+ class TokenData(BaseModel):
+     username: Optional[str] = None
+
+ class User(BaseModel):
+     username: str
+     email: Optional[str] = None
+     full_name: Optional[str] = None
+     disabled: Optional[bool] = None
+
+ class UserInDB(User):
+     hashed_password: str
routes/__init__.py ADDED
File without changes
routes/auth.py ADDED
@@ -0,0 +1,28 @@
+ from fastapi import APIRouter, Depends, HTTPException, status
+ from fastapi.security import OAuth2PasswordRequestForm
+ from datetime import timedelta
+ from auth import authenticate_user, create_access_token
+ from models import Token
+ from config import ACCESS_TOKEN_EXPIRE_HOURS
+ from database import users_db
+
+ router = APIRouter()
+
+ @router.post("/token", response_model=Token)
+ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
+     """
+     Endpoint to obtain an access token with a username and password
+     """
+     user = authenticate_user(users_db, form_data.username, form_data.password)
+     if not user:
+         raise HTTPException(
+             status_code=status.HTTP_401_UNAUTHORIZED,
+             detail="Incorrect username or password",
+             headers={"WWW-Authenticate": "Bearer"},
+         )
+
+     access_token_expires = timedelta(hours=ACCESS_TOKEN_EXPIRE_HOURS)
+     access_token = create_access_token(
+         data={"sub": user.username}, expires_delta=access_token_expires
+     )
+     return Token(access_token=access_token, token_type="bearer")
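A minimal client-side sketch of this login flow (assuming a server on localhost:8000 and the requests library; the password value is a placeholder):

import requests

# OAuth2PasswordRequestForm expects form-encoded fields, not JSON
resp = requests.post(
    "http://localhost:8000/token",
    data={"username": "chien_vm", "password": "<password>"},
)
resp.raise_for_status()
token = resp.json()["access_token"]  # bearer token for subsequent requests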
routes/health.py ADDED
@@ -0,0 +1,8 @@
+ from fastapi import APIRouter
+ import time
+
+ router = APIRouter()
+
+ @router.get("/health")
+ async def health_check():
+     return {"status": "ok", "timestamp": time.time()}
routes/predict.py ADDED
@@ -0,0 +1,77 @@
+ import os
+ import time
+ import shutil
+ from pathlib import Path
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
+ from fastapi.responses import FileResponse
+ from auth import get_current_user
+ from services.sentence_transformer_service import SentenceTransformerService, sentence_transformer_service
+ from data_lib.input_name_data import InputNameData
+ from mapping_lib.name_mapping_helper import NameMappingHelper
+ from config import UPLOAD_DIR, OUTPUT_DIR
+
+ router = APIRouter()
+
+ @router.post("/predict")
+ async def predict(
+     current_user=Depends(get_current_user),
+     file: UploadFile = File(...),
+     sentence_service: SentenceTransformerService = Depends(lambda: sentence_transformer_service)
+ ):
+     """
+     Process an input CSV file and return standardized names (requires authentication)
+     """
+     if not file.filename.endswith(".csv"):
+         raise HTTPException(status_code=400, detail="Only CSV files are supported")
+
+     # Save uploaded file
+     timestamp = int(time.time())
+     input_file_path = os.path.join(UPLOAD_DIR, f"input_{timestamp}_{current_user.username}.csv")
+     output_file_path = os.path.join(OUTPUT_DIR, f"output_{timestamp}_{current_user.username}.csv")
+
+     try:
+         with open(input_file_path, "wb") as buffer:
+             shutil.copyfileobj(file.file, buffer)
+     finally:
+         file.file.close()
+
+     try:
+         # Process input data
+         inputData = InputNameData(sentence_service.dic_standard_subject)
+         inputData.load_data_from_csv(input_file_path)
+         inputData.process_data()
+
+         # Map standard names
+         nameMappingHelper = NameMappingHelper(
+             sentence_service.sentenceTransformerHelper,
+             inputData,
+             sentence_service.sampleData,
+             sentence_service.sample_name_sentence_embeddings,
+             sentence_service.sample_name_sentence_similarities,
+         )
+         df_predicted = nameMappingHelper.map_standard_names()
+
+         # Create output dataframe and save to CSV
+         column_to_keep = ['シート名', '行', '科目', '分類', '名称', '摘要', '備考']
+         output_df = inputData.dataframe[column_to_keep].copy()
+         output_df.reset_index(drop=False, inplace=True)
+         output_df.loc[:, "出力_科目"] = df_predicted["出力_科目"]
+         output_df.loc[:, "出力_項目名"] = df_predicted["出力_項目名"]
+         output_df.loc[:, "出力_確率度"] = df_predicted["出力_確率度"]
+
+         # Save with utf_8_sig encoding for Japanese Excel compatibility
+         output_df.to_csv(output_file_path, index=False, encoding="utf_8_sig")
+
+         return FileResponse(
+             path=output_file_path,
+             filename=f"output_{Path(file.filename).stem}.csv",
+             media_type="text/csv",
+             headers={
+                 "Content-Disposition": f'attachment; filename="output_{Path(file.filename).stem}.csv"',
+                 "Content-Type": "text/csv",
+             },
+         )
+
+     except Exception as e:
+         print(f"Error processing file: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
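And a matching sketch for calling /predict with a bearer token (same assumptions as the /token example; input.csv is a placeholder file):

import requests

token = "<access token from the /token example above>"
with open("input.csv", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/predict",
        headers={"Authorization": f"Bearer {token}"},
        files={"file": ("input.csv", f, "text/csv")},  # multipart field must be named "file"
    )
resp.raise_for_status()
with open("output.csv", "wb") as out:
    out.write(resp.content)  # the standardized CSV returned as a download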
services/__init__.py ADDED
File without changes
services/sentence_transformer_service.py ADDED
@@ -0,0 +1,66 @@
+ import pickle
+ from config import (
+     MODEL_NAME,
+     SENTENCE_EMBEDDING_FILE,
+     SENTENCE_SIMILARITY_FILE,
+     SAMPLE_DATA_FILE, SUBJECT_DATA_FILE
+ )
+ from sentence_transformer_lib.sentence_transformer_helper import SentenceTransformerHelper
+ from data_lib.subject_data import SubjectData
+ from data_lib.sample_name_data import SampleNameData
+ from clustering_lib.sentence_clustering_lib import SentenceClusteringLib
+ from data_lib.base_data import COL_STANDARD_NAME_KEY, COL_STANDARD_SUBJECT, COL_STANDARD_NAME
+
+ class SentenceTransformerService:
+     def __init__(self):
+         self.sentenceTransformerHelper = None
+         self.dic_standard_subject = None
+         self.sample_name_sentence_embeddings = None
+         self.sample_name_sentence_similarities = None
+         self.sampleData = None
+         self.sentence_clustering_lib = None
+         self.name_groups = None
+
+     def load_model_data(self):
+         """Load model and data only once at startup"""
+         if self.sentenceTransformerHelper is not None:
+             print("Model already loaded. Skipping reload.")
+             return  # Do not reload if the model is already loaded
+
+         print("Loading models and data...")
+         # Load sentence transformer model
+         self.sentenceTransformerHelper = SentenceTransformerHelper(
+             convert_to_zenkaku_flag=True, replace_words=None, keywords=None
+         )
+         self.sentenceTransformerHelper.load_model_by_name(MODEL_NAME)
+
+         # Load standard subject dictionary
+         self.dic_standard_subject = SubjectData.create_standard_subject_dic_from_file(SUBJECT_DATA_FILE)
+
+         # Load pre-computed embeddings and similarities
+         with open(SENTENCE_EMBEDDING_FILE, "rb") as f:
+             self.sample_name_sentence_embeddings = pickle.load(f)
+
+         with open(SENTENCE_SIMILARITY_FILE, "rb") as f:
+             self.sample_name_sentence_similarities = pickle.load(f)
+
+         # Load and process sample data
+         self.sampleData = SampleNameData()
+         self.sampleData.load_data_from_csv(SAMPLE_DATA_FILE)
+         self.sampleData.process_data()
+
+         # Create sentence clusters
+         self.sentence_clustering_lib = SentenceClusteringLib(self.sample_name_sentence_embeddings)
+         best_name_eps = 0.07
+         self.name_groups, _ = self.sentence_clustering_lib.create_sentence_cluster(best_name_eps)
+
+         self.sampleData._create_key_column(
+             COL_STANDARD_NAME_KEY, COL_STANDARD_SUBJECT, COL_STANDARD_NAME
+         )
+         self.sampleData.set_name_sentence_labels(self.name_groups)
+         self.sampleData.build_search_tree()
+
+         print("Models and data loaded successfully")
+
+ # Global instance (singleton)
+ sentence_transformer_service = SentenceTransformerService()
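The module-level instance makes repeat loads cheap; a short sketch of the intended call pattern (assuming the package layout above):

from services.sentence_transformer_service import sentence_transformer_service

sentence_transformer_service.load_model_data()  # loads model, embeddings, and sample data
sentence_transformer_service.load_model_data()  # no-op: prints "Model already loaded. Skipping reload."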
utils.py ADDED
@@ -0,0 +1,7 @@
+ import os
+ from config import DATA_DIR, UPLOAD_DIR, OUTPUT_DIR
+
+ def create_directories():
+     os.makedirs(DATA_DIR, exist_ok=True)
+     os.makedirs(UPLOAD_DIR, exist_ok=True)
+     os.makedirs(OUTPUT_DIR, exist_ok=True)