import transformers
from huggingface_hub import snapshot_download, constants


def download_llm_to_cache(model_name, revision="main", cache_dir=None):
    """
    Download an LLM from the Hugging Face Hub to the cache without loading it into memory.

    Args:
        model_name (str): The name of the model on Hugging Face Hub
            (e.g., "meta-llama/Llama-2-7b-hf").
        revision (str, optional): The specific model version to use. Defaults to "main".
        cache_dir (str, optional): The cache directory to use. If None, uses the
            default HF cache directory.

    Returns:
        str: Path to the model in cache, or None if the download failed.
    """
    # Get the default cache dir if not specified
    if cache_dir is None:
        cache_dir = constants.HUGGINGFACE_HUB_CACHE

    try:
        # Download the model files to the cache without loading them into memory
        cached_path = snapshot_download(
            repo_id=model_name,
            revision=revision,
            cache_dir=cache_dir,
            local_files_only=False,  # Set to True to check the local cache only
        )
        print(f"Model '{model_name}' is available in cache at: {cached_path}")
        return cached_path
    except Exception as e:
        print(f"Error downloading model '{model_name}': {e}")
        return None


def load_model(path, cache_dir=None):
    """Load a causal LM and its tokenizer from a local path or Hub repo id."""
    model = transformers.AutoModelForCausalLM.from_pretrained(
        path, cache_dir=cache_dir, device_map="auto", trust_remote_code=False
    )
    # Tokenizers are not placed on devices, so device_map is not passed here
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        path, cache_dir=cache_dir, trust_remote_code=False
    )
    return model, tokenizer


def llm_run(model, tokenizer, genes, N):
    """Generate N short sampled completions per gene prompt, yielding results incrementally."""
    # The model was already placed by device_map="auto" in load_model, so the
    # pipeline reuses its existing device placement
    generate = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer)
    output = []
    for gene in genes:
        # num_return_sequences=N samples N completions of exactly 4 new tokens per gene
        out = generate(
            [gene],
            min_new_tokens=4,
            max_new_tokens=4,
            do_sample=True,
            num_return_sequences=N,
        )
        output.append(out[0])
        # Yield the accumulated results so callers can consume them as they arrive
        yield output
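

# Minimal usage sketch (an assumption, not part of the original module): the model id
# "gpt2" and the example gene prompts below are placeholders chosen only for
# illustration; substitute your own Hub repo id and prompt list.
if __name__ == "__main__":
    model_id = "gpt2"  # hypothetical choice; any causal LM repo id works
    cached_path = download_llm_to_cache(model_id)
    if cached_path is not None:
        model, tokenizer = load_model(cached_path)
        example_genes = ["TP53", "BRCA1"]  # placeholder prompts
        # llm_run is a generator: each iteration yields the results accumulated so far
        for partial in llm_run(model, tokenizer, example_genes, N=3):
            print(f"Generated completions for {len(partial)} gene(s) so far")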