import inspect
import os
from typing import Dict, Any, Optional, List, Iterator

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.schema.output import GenerationChunk
from pydantic import root_validator
from langchain.llms import gpt4all

from utils import FakeTokenizer, get_ngpus_vis, url_alive, download_simple
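
# Wrappers around langchain's GPT4All and LlamaCpp LLMs: they apply instruct prompting via a prompter,
# truncate over-long prompts, and handle GGML model download plus threading/GPU-layer defaults.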


def get_model_tokenizer_gpt4all(base_model, n_jobs=None, max_seq_len=None, llamacpp_dict=None):
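    """Load a gpt4all/llama.cpp model by base_model name; return (inner model client, FakeTokenizer(), 'cpu')."""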
    assert llamacpp_dict is not None
    # defaults (some of these are generation parameters, so need to be passed in at generation time)
    model_name = base_model.lower()
    model = get_llm_gpt4all(model_name, model=None,
                            # max_new_tokens=max_new_tokens,
                            # temperature=temperature,
                            # repetition_penalty=repetition_penalty,
                            # top_k=top_k,
                            # top_p=top_p,
                            # callbacks=callbacks,
                            n_jobs=n_jobs,
                            # verbose=verbose,
                            # streaming=stream_output,
                            # prompter=prompter,
                            # context=context,
                            # iinput=iinput,
                            inner_class=True,
                            max_seq_len=max_seq_len,
                            llamacpp_dict=llamacpp_dict,
                            )
    return model, FakeTokenizer(), 'cpu'


from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler


class H2OStreamingStdOutCallbackHandler(StreamingStdOutCallbackHandler):

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """Run on new LLM token. Only available when streaming is enabled."""
        # streaming to std already occurs without this
        # sys.stdout.write(token)
        # sys.stdout.flush()
        pass


def get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=[]):
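    """Merge kwargs for cls: class signature defaults, then default_kwargs, then the user llamacpp_dict,
    keeping only keys cls accepts and coercing numeric values to int/float where possible."""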
    # default from class
    model_kwargs = {k: v.default for k, v in dict(inspect.signature(cls).parameters).items() if k not in exclude_list}
    # from our defaults
    model_kwargs.update(default_kwargs)
    # from user defaults
    model_kwargs.update(llamacpp_dict)
    # ensure only valid keys
    func_names = list(inspect.signature(cls).parameters)
    model_kwargs = {k: v for k, v in model_kwargs.items() if k in func_names}
    # make int or float if can to satisfy types for class
    for k, v in model_kwargs.items():
        try:
            if float(v) == int(v):
                model_kwargs[k] = int(v)
            else:
                model_kwargs[k] = float(v)
        except:
            pass
    return model_kwargs


def get_gpt4all_default_kwargs(max_new_tokens=256,
                               temperature=0.1,
                               repetition_penalty=1.0,
                               top_k=40,
                               top_p=0.7,
                               n_jobs=None,
                               verbose=False,
                               max_seq_len=None,
                               ):
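    """Build default model/generation kwargs shared by the GPT4All and llama.cpp backends,
    including thread count and GPU-layer offload (n_gpu_layers=100) when GPUs are visible."""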
    if n_jobs in [None, -1]:
        n_jobs = int(os.getenv('OMP_NUM_THREADS', str(os.cpu_count() // 2)))
    n_jobs = max(1, min(20, n_jobs))  # hurts beyond some point
    n_gpus = get_ngpus_vis()
    default_kwargs = dict(context_erase=0.5,
                          n_batch=1,
                          max_tokens=max_seq_len - max_new_tokens,
                          n_predict=max_new_tokens,
                          repeat_last_n=64 if repetition_penalty != 1.0 else 0,
                          repeat_penalty=repetition_penalty,
                          temp=temperature,
                          temperature=temperature,
                          top_k=top_k,
                          top_p=top_p,
                          use_mlock=True,
                          n_ctx=max_seq_len,
                          n_threads=n_jobs,
                          verbose=verbose)
    if n_gpus != 0:
        default_kwargs.update(dict(n_gpu_layers=100))
    return default_kwargs


def get_llm_gpt4all(model_name,
                    model=None,
                    max_new_tokens=256,
                    temperature=0.1,
                    repetition_penalty=1.0,
                    top_k=40,
                    top_p=0.7,
                    streaming=False,
                    callbacks=None,
                    prompter=None,
                    context='',
                    iinput='',
                    n_jobs=None,
                    verbose=False,
                    inner_class=False,
                    max_seq_len=None,
                    llamacpp_dict=None,
                    ):
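    """Construct an H2OLlamaCpp ('llama') or H2OGPT4All ('gpt4all_llama', 'gptj') LLM, downloading the
    model file if needed; return the inner client when inner_class=True, else the langchain LLM."""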
    if not inner_class:
        assert prompter is not None

    default_kwargs = \
        get_gpt4all_default_kwargs(max_new_tokens=max_new_tokens,
                                   temperature=temperature,
                                   repetition_penalty=repetition_penalty,
                                   top_k=top_k,
                                   top_p=top_p,
                                   n_jobs=n_jobs,
                                   verbose=verbose,
                                   max_seq_len=max_seq_len,
                                   )
    if model_name == 'llama':
        cls = H2OLlamaCpp
        if model is None:
            llamacpp_dict = llamacpp_dict.copy()
            model_path = llamacpp_dict.pop('model_path_llama')
            if os.path.isfile(os.path.basename(model_path)):
                # e.g. if offline but previously downloaded
                model_path = os.path.basename(model_path)
            elif url_alive(model_path):
                # online
                ggml_path = os.getenv('GGML_PATH')
                dest = os.path.join(ggml_path, os.path.basename(model_path)) if ggml_path else None
                model_path = download_simple(model_path, dest=dest)
        else:
            model_path = model
        model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
        model_kwargs.update(dict(model_path=model_path, callbacks=callbacks, streaming=streaming,
                                 prompter=prompter, context=context, iinput=iinput))

        # migration to new langchain fix:
        odd_keys = ['model_kwargs', 'grammar_path', 'grammar']
        for key in odd_keys:
            model_kwargs.pop(key, None)

        llm = cls(**model_kwargs)
        llm.client.verbose = verbose
        inner_model = llm.client
    elif model_name == 'gpt4all_llama':
        cls = H2OGPT4All
        if model is None:
            llamacpp_dict = llamacpp_dict.copy()
            model_path = llamacpp_dict.pop('model_name_gpt4all_llama')
            if url_alive(model_path):
                # online
                ggml_path = os.getenv('GGML_PATH')
                dest = os.path.join(ggml_path, os.path.basename(model_path)) if ggml_path else None
                model_path = download_simple(model_path, dest=dest)
        else:
            model_path = model
        model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
        model_kwargs.update(
            dict(model=model_path, backend='llama', callbacks=callbacks, streaming=streaming,
                 prompter=prompter, context=context, iinput=iinput))
        llm = cls(**model_kwargs)
        inner_model = llm.client
    elif model_name == 'gptj':
        cls = H2OGPT4All
        if model is None:
            llamacpp_dict = llamacpp_dict.copy()
            model_path = llamacpp_dict.pop('model_name_gptj') if model is None else model
            if url_alive(model_path):
                ggml_path = os.getenv('GGML_PATH')
                dest = os.path.join(ggml_path, os.path.basename(model_path)) if ggml_path else None
                model_path = download_simple(model_path, dest=dest)
        else:
            model_path = model
        model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
        model_kwargs.update(
            dict(model=model_path, backend='gptj', callbacks=callbacks, streaming=streaming,
                 prompter=prompter, context=context, iinput=iinput))
        llm = cls(**model_kwargs)
        inner_model = llm.client
    else:
        raise RuntimeError("No such model_name %s" % model_name)
    if inner_class:
        return inner_model
    else:
        return llm


class H2OGPT4All(gpt4all.GPT4All):
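    """GPT4All LLM that truncates long prompts and applies instruct prompting via prompter/context/iinput."""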
    model: Any
    """Path to the pre-trained GPT4All model file."""
    prompter: Any
    context: Any = ''
    iinput: Any = ''

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the python package exists in the environment."""
        try:
            if isinstance(values["model"], str):
                from gpt4all import GPT4All as GPT4AllModel

                full_path = values["model"]
                model_path, delimiter, model_name = full_path.rpartition("/")
                model_path += delimiter

                values["client"] = GPT4AllModel(
                    model_name=model_name,
                    model_path=model_path or None,
                    model_type=values["backend"],
                    allow_download=True,
                )
                if values["n_threads"] is not None:
                    # set n_threads
                    values["client"].model.set_thread_count(values["n_threads"])
            else:
                values["client"] = values["model"]
                if values["n_threads"] is not None:
                    # set n_threads
                    values["client"].model.set_thread_count(values["n_threads"])
            try:
                values["backend"] = values["client"].model_type
            except AttributeError:
                # The below is for compatibility with GPT4All Python bindings <= 0.2.3.
                values["backend"] = values["client"].model.model_type
        except ImportError:
            raise ValueError(
                "Could not import gpt4all python package. "
                "Please install it with `pip install gpt4all`."
            )
        return values

    def _call(
            self,
            prompt: str,
            stop: Optional[List[str]] = None,
            run_manager: Optional[CallbackManagerForLLMRun] = None,
            **kwargs,
    ) -> str:
        # Roughly 4 chars per token if natural language
        n_ctx = 2048
        prompt = prompt[-self.max_tokens * 4:]

        # use instruct prompting
        data_point = dict(context=self.context, instruction=prompt, input=self.iinput)
        prompt = self.prompter.generate_prompt(data_point)

        verbose = False
        if verbose:
            print("_call prompt: %s" % prompt, flush=True)
        # FIXME: GPT4All doesn't support yield during generate, so cannot support streaming except via itself to stdout
        return super()._call(prompt, stop=stop, run_manager=run_manager)

    # FIXME: Unsure what uses
    # def get_token_ids(self, text: str) -> List[int]:
    #     return self.client.tokenize(b" " + text.encode("utf-8"))


from langchain.llms import LlamaCpp


class H2OLlamaCpp(LlamaCpp):
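    """LlamaCpp LLM that truncates prompts to fit n_ctx, applies instruct prompting via prompter/context/iinput,
    and, when streaming, yields the prompt first so downstream handlers can strip it from the output."""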
    model_path: Any
    """Path to the pre-trained llama.cpp model file."""
    prompter: Any
    context: Any
    iinput: Any

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that llama-cpp-python library is installed."""
        if isinstance(values["model_path"], str):
            model_path = values["model_path"]
            model_param_names = [
                "lora_path",
                "lora_base",
                "n_ctx",
                "n_parts",
                "seed",
                "f16_kv",
                "logits_all",
                "vocab_only",
                "use_mlock",
                "n_threads",
                "n_batch",
                "use_mmap",
                "last_n_tokens_size",
            ]
            model_params = {k: values[k] for k in model_param_names}
            # For backwards compatibility, only include if non-null.
            if values["n_gpu_layers"] is not None:
                model_params["n_gpu_layers"] = values["n_gpu_layers"]

            try:
                try:
                    from llama_cpp import Llama
                except ImportError:
                    from llama_cpp_cuda import Llama

                values["client"] = Llama(model_path, **model_params)
            except ImportError:
                raise ModuleNotFoundError(
                    "Could not import llama-cpp-python library. "
                    "Please install the llama-cpp-python library to "
                    "use this embedding model: pip install llama-cpp-python"
                )
            except Exception as e:
                raise ValueError(
                    f"Could not load Llama model from path: {model_path}. "
                    f"Received error {e}"
                )
        else:
            values["client"] = values["model_path"]
        return values

    def _call(
            self,
            prompt: str,
            stop: Optional[List[str]] = None,
            run_manager: Optional[CallbackManagerForLLMRun] = None,
            **kwargs,
    ) -> str:
        verbose = False
        # tokenize twice, just to count tokens, since llama cpp python wrapper has no way to truncate
        # still have to avoid crazy sizes, else hit llama_tokenize: too many tokens -- might still hit, not fatal
        prompt = prompt[-self.n_ctx * 4:]
        prompt_tokens = self.client.tokenize(b" " + prompt.encode("utf-8"))
        num_prompt_tokens = len(prompt_tokens)
        if num_prompt_tokens > self.n_ctx:
            # conservative by using int()
            chars_per_token = int(len(prompt) / num_prompt_tokens)
            prompt = prompt[-self.n_ctx * chars_per_token:]
            if verbose:
                print("reducing tokens, assuming average of %s chars/token" % chars_per_token, flush=True)
                prompt_tokens2 = self.client.tokenize(b" " + prompt.encode("utf-8"))
                num_prompt_tokens2 = len(prompt_tokens2)
                print("reduced tokens from %d -> %d" % (num_prompt_tokens, num_prompt_tokens2), flush=True)

        # use instruct prompting
        data_point = dict(context=self.context, instruction=prompt, input=self.iinput)
        prompt = self.prompter.generate_prompt(data_point)

        if verbose:
            print("_call prompt: %s" % prompt, flush=True)

        if self.streaming:
            # parent handler of streamer expects to see prompt first else output="" and lose if prompt=None in prompter
            text = ""
            for token in self.stream(input=prompt, stop=stop):
                # for token in self.stream(input=prompt, stop=stop, run_manager=run_manager):
                text_chunk = token  # ["choices"][0]["text"]
                # self.stream already calls text_callback
                # if text_callback:
                #     text_callback(text_chunk)
                text += text_chunk
            # parent handler of streamer expects to see prompt first else output="" and lose if prompt=None in prompter
            return text[len(prompt):]
        else:
            params = self._get_parameters(stop)
            params = {**params, **kwargs}
            result = self.client(prompt=prompt, **params)
            return result["choices"][0]["text"]

    def _stream(
            self,
            prompt: str,
            stop: Optional[List[str]] = None,
            run_manager: Optional[CallbackManagerForLLMRun] = None,
            **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        # parent handler of streamer expects to see prompt first else output="" and lose if prompt=None in prompter
        logprobs = 0
        chunk = GenerationChunk(
            text=prompt,
            generation_info={"logprobs": logprobs},
        )
        yield chunk
        if run_manager:
            run_manager.on_llm_new_token(
                token=chunk.text, verbose=self.verbose, log_probs=logprobs
            )
        # actual new tokens
        for chunk in super()._stream(prompt, stop=stop, run_manager=run_manager, **kwargs):
            yield chunk

    def get_token_ids(self, text: str) -> List[int]:
        return self.client.tokenize(b" " + text.encode("utf-8"))
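

# Illustrative usage sketch (an assumption-laden example, not part of the module): 'ggml-model.bin' is a
# hypothetical local llama.cpp model file; llamacpp_dict follows the 'model_path_llama' convention above.
#
#   llamacpp_dict = dict(model_path_llama='ggml-model.bin')
#   model, tokenizer, device = get_model_tokenizer_gpt4all('llama',
#                                                          n_jobs=4,
#                                                          max_seq_len=2048,
#                                                          llamacpp_dict=llamacpp_dict)
#   # model is the raw llama_cpp.Llama client (inner_class=True); tokenizer is a FakeTokenizer.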