import os

# Set cache dirs before importing transformers/language_tool_python below,
# so the libraries pick them up (must match the Dockerfile ENV vars)
os.environ['HOME'] = '/app'
os.environ['HF_HOME'] = '/app/.hf_cache'
os.environ['LANGTOOL_HOME'] = '/app/.ltool_cache'
os.environ['XDG_CACHE_HOME'] = '/app/.cache'
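
# A minimal sketch of the matching Dockerfile lines (assumed; the script
# name and exact build step are not from this repo, adjust to the actual one):
#   ENV HOME=/app \
#       HF_HOME=/app/.hf_cache \
#       LANGTOOL_HOME=/app/.ltool_cache \
#       XDG_CACHE_HOME=/app/.cache
#   RUN python pre_cache_models.py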

import language_tool_python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def pre_cache_models():
    """
    Downloads and caches all required models and dependencies.
    This script is run during the Docker build process.
    """
    print("Caching LanguageTool model...")
    try:
        # This will download and cache the LanguageTool server files
        language_tool_python.LanguageTool('en-US')
        print("LanguageTool model cached successfully.")
    except Exception as e:
        print(f"Failed to cache LanguageTool: {e}")

    print("\nCaching Hugging Face models...")
    models_to_cache = [
        "vennify/t5-base-grammar-correction",
        "humarin/chatgpt_paraphraser_on_T5_base"
    ]

    for model_name in models_to_cache:
        try:
            print(f"Caching {model_name}...")
            # Cache both tokenizer and model files
            AutoTokenizer.from_pretrained(model_name)
            AutoModelForSeq2SeqLM.from_pretrained(model_name)
            print(f"{model_name} cached successfully.")
        except Exception as e:
            print(f"Failed to cache {model_name}: {e}")

    print("\nAll models have been cached.")

if __name__ == "__main__":
    pre_cache_models()