Commit 29ea64b
Parent(s): c596b21

Update with h2oGPT hash c7453f1b1ab51fb1cd342a9867f23cd7b538e000

Files changed:
- src/client_test.py   +8 -17
- src/gen.py           +7 -44
- src/gpt_langchain.py +279 -337
- src/gradio_runner.py +10 -7
src/client_test.py
CHANGED

@@ -80,7 +80,7 @@ def get_args(prompt, prompt_type=None, chat=False, stream_output=False,
              version=None,
              h2ogpt_key=None,
              visible_models=None,
-             system_prompt='',  # default of no system prompt
+             system_prompt='',  # default of no system prompt tiggered by empty string
              add_search_to_context=False,
              chat_conversation=None,
              text_context_list=None,
@@ -256,18 +256,13 @@ def run_client_nochat_api(prompt, prompt_type, max_new_tokens, version=None, h2o


 @pytest.mark.skip(reason="For manual use against some server, no server launched")
-def test_client_basic_api_lean(prompt_type='human_bot', version=None, h2ogpt_key=None,
-                               chat_conversation=None, system_prompt=''):
-    return run_client_nochat_api_lean(prompt='Who are you?', prompt_type=prompt_type, max_new_tokens=50,
-                                      version=version, h2ogpt_key=h2ogpt_key,
-                                      chat_conversation=chat_conversation,
-                                      system_prompt=system_prompt)
+def test_client_basic_api_lean(prompt_type='human_bot', version=None, h2ogpt_key=None):
+    return run_client_nochat_api_lean(prompt='Who are you?', prompt_type=prompt_type, max_new_tokens=50,
+                                      version=version, h2ogpt_key=h2ogpt_key)


-def run_client_nochat_api_lean(prompt, prompt_type, max_new_tokens, version=None, h2ogpt_key=None,
-                               chat_conversation=None, system_prompt=''):
-    kwargs = dict(instruction_nochat=prompt, h2ogpt_key=h2ogpt_key, chat_conversation=chat_conversation,
-                  system_prompt=system_prompt)
+def run_client_nochat_api_lean(prompt, prompt_type, max_new_tokens, version=None, h2ogpt_key=None):
+    kwargs = dict(instruction_nochat=prompt, h2ogpt_key=h2ogpt_key)

     api_name = '/submit_nochat_api'  # NOTE: like submit_nochat but stable API for string dict passing
     client = get_client(serialize=True)
@@ -367,9 +362,7 @@ def run_client_chat(prompt='',
                     langchain_agents=[],
                     prompt_type=None, prompt_dict=None,
                     version=None,
-                    h2ogpt_key=None,
-                    chat_conversation=None,
-                    system_prompt=''):
+                    h2ogpt_key=None):
     client = get_client(serialize=False)

     kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output,
@@ -379,9 +372,7 @@ def run_client_chat(prompt='',
                             langchain_agents=langchain_agents,
                             prompt_dict=prompt_dict,
                             version=version,
-                            h2ogpt_key=h2ogpt_key,
-                            chat_conversation=chat_conversation,
-                            system_prompt=system_prompt)
+                            h2ogpt_key=h2ogpt_key)
     return run_client(client, prompt, args, kwargs)
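The hunks above strip chat_conversation and system_prompt from the lean no-chat client helpers, so the lean path only sends instruction_nochat and h2ogpt_key. For orientation, a minimal standalone sketch of hitting the same /submit_nochat_api route with gradio_client; the server URL, the str()/ast.literal_eval round-trip, and the helper name are assumptions for illustration, not part of this commit:

# Hypothetical standalone caller mirroring run_client_nochat_api_lean() above.
# Assumes an h2oGPT Gradio server at `server`; endpoint name and kwargs follow the diff.
import ast

from gradio_client import Client


def ask_lean(prompt, h2ogpt_key=None, server='http://localhost:7860'):
    client = Client(server)
    kwargs = dict(instruction_nochat=prompt, h2ogpt_key=h2ogpt_key)
    # /submit_nochat_api takes a stringified kwargs dict and returns a stringified dict
    res = client.predict(str(kwargs), api_name='/submit_nochat_api')
    return ast.literal_eval(res)['response']


if __name__ == '__main__':
    print(ask_lean('Who are you?'))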
src/gen.py
CHANGED

@@ -335,7 +335,7 @@ def main(

          Or Address can be for vLLM:
               Use: "vllm:IP:port" for OpenAI-compliant vLLM endpoint
-
+              Note: vllm_chat not supported by vLLM project.

          Or Address can be replicate:
              Use:
@@ -2236,17 +2236,6 @@ def evaluate(
         instruction = instruction_nochat
         iinput = iinput_nochat

-    # avoid instruction in chat_conversation itself, since always used as additional context to prompt in what follows
-    if isinstance(chat_conversation, list) and \
-            len(chat_conversation) > 0 and \
-            len(chat_conversation[-1]) == 2 and \
-            chat_conversation[-1][0] == instruction:
-        chat_conversation = chat_conversation[:-1]
-    if not add_chat_history_to_context:
-        # make it easy to ignore without needing add_chat_history_to_context
-        # some langchain or unit test may need to then handle more general case
-        chat_conversation = []
-
     # in some cases, like lean nochat API, don't want to force sending prompt_type, allow default choice
     model_lower = base_model.lower()
     if not prompt_type and model_lower in inv_prompt_type_to_model_lower and prompt_type != 'custom':
@@ -2495,8 +2484,7 @@ def evaluate(
         prompt, \
             instruction, iinput, context, \
             num_prompt_tokens, max_new_tokens, num_prompt_tokens0, num_prompt_tokens_actual, \
-            chat_index, \
-            top_k_docs_trial, one_doc_size = \
+            chat_index, top_k_docs_trial, one_doc_size = \
             get_limited_prompt(instruction,
                                iinput,
                                tokenizer,
@@ -2564,6 +2552,8 @@ def evaluate(
                                               sanitize_bot_response=sanitize_bot_response)
             yield dict(response=response, sources=sources, save_dict=dict())
         elif inf_type == 'vllm_chat' or inference_server == 'openai_chat':
+            if inf_type == 'vllm_chat':
+                raise NotImplementedError('%s not supported by vLLM' % inf_type)
             if system_prompt in [None, 'None', 'auto']:
                 openai_system_prompt = "You are a helpful assistant."
             else:
@@ -2571,16 +2561,7 @@ def evaluate(
             messages0 = []
             if openai_system_prompt:
                 messages0.append({"role": "system", "content": openai_system_prompt})
-
-            assert external_handle_chat_conversation, "Should be handling only externally"
-            # chat_index handles token counting issues
-            for message1 in chat_conversation[chat_index:]:
-                if len(message1) == 2:
-                    messages0.append(
-                        {'role': 'user', 'content': message1[0] if message1[0] is not None else ''})
-                    messages0.append(
-                        {'role': 'assistant', 'content': message1[1] if message1[1] is not None else ''})
-            messages0.append({'role': 'user', 'content': prompt if prompt is not None else ''})
+            messages0.append({'role': 'user', 'content': prompt})
             responses = openai.ChatCompletion.create(
                 model=base_model,
                 messages=messages0,
@@ -3628,27 +3609,13 @@ def get_limited_prompt(instruction,
     stream_output = prompter.stream_output
     system_prompt = prompter.system_prompt

-    generate_prompt_type = prompt_type
-    external_handle_chat_conversation = False
-    if any(inference_server.startswith(x) for x in ['openai_chat', 'openai_azure_chat', 'vllm_chat']):
-        # Chat APIs do not take prompting
-        # Replicate does not need prompting if no chat history, but in general can take prompting
-        # if using prompter, prompter.system_prompt will already be filled with automatic (e.g. from llama-2),
-        # so if replicate final prompt with system prompt still correct because only access prompter.system_prompt that was already set
-        # below already true for openai,
-        # but not vllm by default as that can be any model and handled by FastChat API inside vLLM itself
-        generate_prompt_type = 'plain'
-        # Chat APIs don't handle chat history via single prompt, but in messages, assumed to be handled outside this function
-        chat_conversation = []
-        external_handle_chat_conversation = True
-
     # merge handles if chat_conversation is None
     history = []
     history = merge_chat_conversation_history(chat_conversation, history)
     history_to_context_func = functools.partial(history_to_context,
                                                 langchain_mode=langchain_mode,
                                                 add_chat_history_to_context=add_chat_history_to_context,
-                                                prompt_type=generate_prompt_type,
+                                                prompt_type=prompt_type,
                                                 prompt_dict=prompt_dict,
                                                 chat=chat,
                                                 model_max_length=model_max_length,
@@ -3781,9 +3748,6 @@ def get_limited_prompt(instruction,
     stream_output = False  # doesn't matter
     prompter = Prompter(prompt_type, prompt_dict, debug=debug, chat=chat, stream_output=stream_output,
                         system_prompt=system_prompt)
-    if prompt_type != generate_prompt_type:
-        # override just this attribute, keep system_prompt etc. from original prompt_type
-        prompter.prompt_type = generate_prompt_type

     data_point = dict(context=context, instruction=instruction, input=iinput)
     # handle promptA/promptB addition if really from history.
@@ -3796,8 +3760,7 @@ def get_limited_prompt(instruction,
     return prompt, \
         instruction, iinput, context, \
         num_prompt_tokens, max_new_tokens, num_prompt_tokens0, num_prompt_tokens_actual, \
-        chat_index, \
-        top_k_docs, one_doc_size
+        chat_index, top_k_docs, one_doc_size


 def get_docs_tokens(tokenizer, text_context_list=[], max_input_tokens=None):
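With the removals above, the openai_chat/vllm_chat branch of evaluate() no longer folds chat history into the message list (history is handled elsewhere), and vllm_chat now raises NotImplementedError. A minimal sketch of the resulting message construction against the pre-1.0 openai client; the generation kwargs and return handling are illustrative, not the full evaluate() logic:

# Sketch of the simplified openai_chat message construction shown in the diff.
# Assumes openai<1.0 (openai.ChatCompletion.create), matching the call in gen.py.
import openai


def chat_once(base_model, prompt, system_prompt='auto', max_new_tokens=256, temperature=0.1):
    if system_prompt in [None, 'None', 'auto']:
        openai_system_prompt = "You are a helpful assistant."
    else:
        openai_system_prompt = system_prompt

    messages0 = []
    if openai_system_prompt:
        messages0.append({"role": "system", "content": openai_system_prompt})
    # chat history is no longer appended here; only the single user prompt is sent
    messages0.append({'role': 'user', 'content': prompt})

    responses = openai.ChatCompletion.create(
        model=base_model,
        messages=messages0,
        max_tokens=max_new_tokens,
        temperature=temperature,
    )
    return responses['choices'][0]['message']['content']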
src/gpt_langchain.py
CHANGED

@@ -29,11 +29,10 @@ import yaml

 from joblib import delayed
 from langchain.callbacks import streaming_stdout
-from langchain.callbacks.base import Callbacks
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.llms.huggingface_pipeline import VALID_TASKS
 from langchain.llms.utils import enforce_stop_tokens
-from langchain.schema import LLMResult, Generation
+from langchain.schema import LLMResult, Generation
 from langchain.tools import PythonREPLTool
 from langchain.tools.json.tool import JsonSpec
 from tqdm import tqdm
@@ -945,10 +944,7 @@ class H2OReplicate(Replicate):
         assert self.tokenizer is not None
         from h2oai_pipeline import H2OTextGenerationPipeline
         prompt, num_prompt_tokens = H2OTextGenerationPipeline.limit_prompt(prompt, self.tokenizer)
-        # Note Replicate handles the prompting of the specific model
-        data_point = dict(context=self.context, instruction=prompt, input=self.iinput)
-        prompt = self.prompter.generate_prompt(data_point)
-
+        # Note Replicate handles the prompting of the specific model
         return super()._call(prompt, stop=stop, run_manager=run_manager, **kwargs)

     def get_token_ids(self, text: str) -> List[int]:
@@ -957,98 +953,21 @@ class H2OReplicate(Replicate):
         # return _get_token_ids_default_method(text)


-class ExtraChat:
-    def get_messages(self, prompts):
-        from langchain.schema import AIMessage, SystemMessage, HumanMessage
-        messages = []
-        if self.system_prompt:
-            messages.append(SystemMessage(content=self.system_prompt))
-        if self.chat_conversation:
-            for messages1 in self.chat_conversation:
-                messages.append(HumanMessage(content=messages1[0] if messages1[0] is not None else ''))
-                messages.append(AIMessage(content=messages1[1] if messages1[1] is not None else ''))
-        assert len(prompts) == 1, "Not implemented"
-        messages.append(HumanMessage(content=prompts[0].text if prompts[0].text is not None else ''))
-        return [messages]
-
-
-class H2OChatOpenAI(ChatOpenAI, ExtraChat):
-    tokenizer: Any = None  # for vllm_chat
-    system_prompt: Any = None
-    chat_conversation: Any = []
-
+class H2OChatOpenAI(ChatOpenAI):
     @classmethod
     def _all_required_field_names(cls) -> Set:
         _all_required_field_names = super(ChatOpenAI, cls)._all_required_field_names()
         _all_required_field_names.update({'top_p', 'frequency_penalty', 'presence_penalty', 'logit_bias'})
         return _all_required_field_names

-    def get_token_ids(self, text: str) -> List[int]:
-        if self.tokenizer is not None:
-            return self.tokenizer.encode(text)
-        else:
-            # OpenAI uses tiktoken
-            return super().get_token_ids(text)
-
-    def generate_prompt(
-            self,
-            prompts: List[PromptValue],
-            stop: Optional[List[str]] = None,
-            callbacks: Callbacks = None,
-            **kwargs: Any,
-    ) -> LLMResult:
-        prompt_messages = self.get_messages(prompts)
-        # prompt_messages = [p.to_messages() for p in prompts]
-        return self.generate(prompt_messages, stop=stop, callbacks=callbacks, **kwargs)
-
-    async def agenerate_prompt(
-            self,
-            prompts: List[PromptValue],
-            stop: Optional[List[str]] = None,
-            callbacks: Callbacks = None,
-            **kwargs: Any,
-    ) -> LLMResult:
-        prompt_messages = self.get_messages(prompts)
-        # prompt_messages = [p.to_messages() for p in prompts]
-        return await self.agenerate(
-            prompt_messages, stop=stop, callbacks=callbacks, **kwargs
-        )
-
-
-class H2OAzureChatOpenAI(AzureChatOpenAI, ExtraChat):
-    system_prompt: Any = None
-    chat_conversation: Any = []
+class H2OAzureChatOpenAI(AzureChatOpenAI):

     @classmethod
     def _all_required_field_names(cls) -> Set:
         _all_required_field_names = super(AzureChatOpenAI, cls)._all_required_field_names()
         _all_required_field_names.update({'top_p', 'frequency_penalty', 'presence_penalty', 'logit_bias'})
         return _all_required_field_names

-    def generate_prompt(
-            self,
-            prompts: List[PromptValue],
-            stop: Optional[List[str]] = None,
-            callbacks: Callbacks = None,
-            **kwargs: Any,
-    ) -> LLMResult:
-        prompt_messages = self.get_messages(prompts)
-        # prompt_messages = [p.to_messages() for p in prompts]
-        return self.generate(prompt_messages, stop=stop, callbacks=callbacks, **kwargs)
-
-    async def agenerate_prompt(
-            self,
-            prompts: List[PromptValue],
-            stop: Optional[List[str]] = None,
-            callbacks: Callbacks = None,
-            **kwargs: Any,
-    ) -> LLMResult:
-        prompt_messages = self.get_messages(prompts)
-        # prompt_messages = [p.to_messages() for p in prompts]
-        return await self.agenerate(
-            prompt_messages, stop=stop, callbacks=callbacks, **kwargs
-        )
-

 class H2OAzureOpenAI(AzureOpenAI):
     @classmethod
@@ -1133,7 +1052,7 @@ def get_llm(use_openai_model=False,
             if 'meta/llama' in model_string:
                 temperature = max(0.01, temperature if do_sample else 0)
             else:
-                temperature = ...
+                temperature = temperature if do_sample else 0
             gen_kwargs = dict(temperature=temperature,
                               seed=1234,
                               max_length=max_new_tokens,  # langchain
@@ -1149,7 +1068,8 @@ def get_llm(use_openai_model=False,
         if system_prompt:
             gen_kwargs.update(dict(system_prompt=system_prompt))

-        # replicate handles prompting
+        # replicate handles prompting, so avoid get_response() filter
+        prompter.prompt_type = 'plain'
         if stream_output:
             callbacks = [StreamingGradioCallbackHandler()]
             streamer = callbacks[0] if stream_output else None
@@ -1188,8 +1108,8 @@ def get_llm(use_openai_model=False,
         if inf_type == 'openai_chat' or inf_type == 'vllm_chat':
             cls = H2OChatOpenAI
             # FIXME: Support context, iinput
-            if inf_type == 'vllm_chat':
-                kwargs_extra.update(dict(tokenizer=tokenizer))
+            # if inf_type == 'vllm_chat':
+            #     kwargs_extra.update(dict(tokenizer=tokenizer))
             openai_api_key = openai.api_key
         elif inf_type == 'openai_azure_chat':
             cls = H2OAzureChatOpenAI
@@ -1248,8 +1168,6 @@ def get_llm(use_openai_model=False,
                       logit_bias=None if inf_type == 'vllm' else {},
                       max_retries=6,
                       streaming=stream_output,
-                      system_prompt=system_prompt,
-                      # chat_conversation=chat_conversation,  # don't do here, not token aware
                       **kwargs_extra
                       )
         streamer = callbacks[0] if stream_output else None
@@ -3582,6 +3500,7 @@ Respond to prompt of Final Answer with your final high-quality bullet list answer
     prompter = Prompter(prompt_type, prompt_dict, debug=False, chat=chat, stream_output=stream_output,
                         system_prompt=system_prompt)

+    use_docs_planned = False
     scores = []
     chain = None

@@ -3598,8 +3517,8 @@ Respond to prompt of Final Answer with your final high-quality bullet list answer
     missing_kwargs = [x for x in func_names if x not in sim_kwargs]
     assert not missing_kwargs, "Missing: %s" % missing_kwargs
     docs, chain, scores, \
-        num_docs_before_cut, \
-        use_llm_if_no_docs, top_k_docs_max_show = \
+        use_docs_planned, num_docs_before_cut, \
+        use_llm_if_no_docs, llm_mode, top_k_docs_max_show = \
         get_chain(**sim_kwargs)
     if document_subset in non_query_commands:
         formatted_doc_chunks = '\n\n'.join([get_url(x) + '\n\n' + x.page_content for x in docs])
@@ -3620,21 +3539,23 @@ Respond to prompt of Final Answer with your final high-quality bullet list answer
         ret, extra = get_sources_answer(*get_answer_args, **get_answer_kwargs)
         yield dict(prompt=prompt_basic, response=formatted_doc_chunks, sources=extra, num_prompt_tokens=0)
         return
-    if ...
-        if not docs ...
-            ...
+    if not use_llm_if_no_docs:
+        if not docs and langchain_action in [LangChainAction.SUMMARIZE_MAP.value,
+                                             LangChainAction.SUMMARIZE_ALL.value,
+                                             LangChainAction.SUMMARIZE_REFINE.value]:
+            ret = 'No relevant documents to summarize.' if num_docs_before_cut else 'No documents to summarize.'
+            extra = ''
+            yield dict(prompt=prompt_basic, response=ret, sources=extra, num_prompt_tokens=0)
+            return
+        if not docs and not llm_mode:
+            ret = 'No relevant documents to query (for chatting with LLM, pick Resources->Collections->LLM).' if num_docs_before_cut else 'No documents to query (for chatting with LLM, pick Resources->Collections->LLM).'
             extra = ''
             yield dict(prompt=prompt_basic, response=ret, sources=extra, num_prompt_tokens=0)
             return

-    ...
+    if chain is None and not langchain_only_model:
+        # here if no docs at all and not HF type
+        # can only return if HF type
         return

     # context stuff similar to used in evaluate()
@@ -3735,8 +3656,7 @@ Respond to prompt of Final Answer with your final high-quality bullet list answer
         prompt = prompt_basic
     num_prompt_tokens = get_token_count(prompt, tokenizer)

-    if ...
-        # if no docs, then no sources to cite
+    if not use_docs_planned:
         ret = answer
         extra = ''
         yield dict(prompt=prompt, response=ret, sources=extra, num_prompt_tokens=num_prompt_tokens)
@@ -3895,7 +3815,8 @@ def get_chain(query=None,
     if text_context_list is None:
         text_context_list = []

-    # ...
+    # default value:
+    llm_mode = langchain_mode in ['Disabled', 'LLM'] and len(text_context_list) == 0
     query_action = langchain_action == LangChainAction.QUERY.value
     summarize_action = langchain_action in [LangChainAction.SUMMARIZE_MAP.value,
                                             LangChainAction.SUMMARIZE_ALL.value,
@@ -3927,6 +3848,8 @@ def get_chain(query=None,
     add_search_to_context &= len(docs_search) > 0
     top_k_docs_max_show = max(top_k_docs_max_show, len(docs_search))

+    if len(text_context_list) > 0:
+        llm_mode = False
     use_llm_if_no_docs = True

     from src.output_parser import H2OMRKLOutputParser
@@ -3954,9 +3877,10 @@ def get_chain(query=None,

         docs = []
         scores = []
+        use_docs_planned = False
         num_docs_before_cut = 0
         use_llm_if_no_docs = True
-        return docs, target, scores, num_docs_before_cut, use_llm_if_no_docs, top_k_docs_max_show
+        return docs, target, scores, use_docs_planned, num_docs_before_cut, use_llm_if_no_docs, llm_mode, top_k_docs_max_show

     if LangChainAgent.COLLECTION.value in langchain_agents:
         output_parser = H2OMRKLOutputParser()
@@ -3975,9 +3899,10 @@ def get_chain(query=None,

         docs = []
         scores = []
+        use_docs_planned = False
         num_docs_before_cut = 0
         use_llm_if_no_docs = True
-        return docs, target, scores, num_docs_before_cut, use_llm_if_no_docs, top_k_docs_max_show
+        return docs, target, scores, use_docs_planned, num_docs_before_cut, use_llm_if_no_docs, llm_mode, top_k_docs_max_show

     if LangChainAgent.PYTHON.value in langchain_agents and inference_server.startswith('openai'):
         chain = create_python_agent(
@@ -3993,9 +3918,10 @@ def get_chain(query=None,

         docs = []
         scores = []
+        use_docs_planned = False
         num_docs_before_cut = 0
         use_llm_if_no_docs = True
-        return docs, target, scores, num_docs_before_cut, use_llm_if_no_docs, top_k_docs_max_show
+        return docs, target, scores, use_docs_planned, num_docs_before_cut, use_llm_if_no_docs, llm_mode, top_k_docs_max_show

     if LangChainAgent.PANDAS.value in langchain_agents and inference_server.startswith('openai_chat'):
         # FIXME: DATA
@@ -4012,9 +3938,10 @@ def get_chain(query=None,

         docs = []
         scores = []
+        use_docs_planned = False
         num_docs_before_cut = 0
         use_llm_if_no_docs = True
-        return docs, target, scores, num_docs_before_cut, use_llm_if_no_docs, top_k_docs_max_show
+        return docs, target, scores, use_docs_planned, num_docs_before_cut, use_llm_if_no_docs, llm_mode, top_k_docs_max_show

     if isinstance(document_choice, str):
         document_choice = [document_choice]
@@ -4044,9 +3971,10 @@ def get_chain(query=None,

         docs = []
         scores = []
+        use_docs_planned = False
         num_docs_before_cut = 0
         use_llm_if_no_docs = True
-        return docs, target, scores, num_docs_before_cut, use_llm_if_no_docs, top_k_docs_max_show
+        return docs, target, scores, use_docs_planned, num_docs_before_cut, use_llm_if_no_docs, llm_mode, top_k_docs_max_show

     if isinstance(document_choice, str):
         document_choice = [document_choice]
@@ -4057,7 +3985,7 @@ def get_chain(query=None,
     document_choice_agent = [x for x in document_choice_agent if x.endswith('.csv')]
     if LangChainAgent.CSV.value in langchain_agents and len(document_choice_agent) == 1 and document_choice_agent[
         0].endswith(
-        ...
+        '.csv'):
         data_file = document_choice[0]
         if inference_server.startswith('openai_chat'):
             chain = create_csv_agent(
@@ -4078,9 +4006,19 @@ def get_chain(query=None,

         docs = []
         scores = []
+        use_docs_planned = False
         num_docs_before_cut = 0
         use_llm_if_no_docs = True
-        return docs, target, scores, num_docs_before_cut, use_llm_if_no_docs, top_k_docs_max_show
+        return docs, target, scores, use_docs_planned, num_docs_before_cut, use_llm_if_no_docs, llm_mode, top_k_docs_max_show
+
+    # determine whether use of context out of docs is planned
+    if not use_openai_model and prompt_type not in ['plain'] or langchain_only_model:
+        if llm_mode:
+            use_docs_planned = False
+        else:
+            use_docs_planned = True
+    else:
+        use_docs_planned = True

     # https://github.com/hwchase17/langchain/issues/1946
     # FIXME: Seems to way to get size of chroma db to limit top_k_docs to avoid
@@ -4152,7 +4090,8 @@ def get_chain(query=None,
                      pre_prompt_query, prompt_query,
                      pre_prompt_summary, prompt_summary,
                      langchain_action,
-                     ...
+                     llm_mode,
+                     use_docs_planned,
                      auto_reduce_chunks,
                      got_db_docs,
                      add_search_to_context)
@@ -4160,242 +4099,239 @@ def get_chain(query=None,
|
|
| 4160 |
max_input_tokens = get_max_input_tokens(llm=llm, tokenizer=tokenizer, inference_server=inference_server,
|
| 4161 |
model_name=model_name, max_new_tokens=max_new_tokens)
|
| 4162 |
|
| 4163 |
-
if
|
| 4164 |
-
|
| 4165 |
-
|
| 4166 |
-
|
| 4167 |
-
|
| 4168 |
-
|
| 4169 |
-
|
| 4170 |
-
|
| 4171 |
-
|
| 4172 |
-
|
| 4173 |
-
|
| 4174 |
-
|
| 4175 |
-
|
| 4176 |
-
|
| 4177 |
-
|
| 4178 |
-
|
| 4179 |
-
|
| 4180 |
-
|
| 4181 |
-
|
| 4182 |
-
|
| 4183 |
-
|
| 4184 |
-
|
| 4185 |
-
{"filter": {"chunk_id": {"$
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4186 |
else:
|
| 4187 |
-
|
| 4188 |
-
|
| 4189 |
-
|
| 4190 |
-
filter_kwargs = {}
|
| 4191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4192 |
or_filter = [
|
| 4193 |
-
{"
|
| 4194 |
-
|
|
|
|
| 4195 |
for x in document_choice]
|
| 4196 |
filter_kwargs = dict(filter={"$or": or_filter})
|
| 4197 |
-
|
| 4198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4199 |
one_filter = \
|
| 4200 |
-
[{"source": {"$eq": x}, "chunk_id": {"$gte": 0}} if query_action else {
|
| 4201 |
-
|
| 4202 |
-
|
| 4203 |
-
"$eq": -1}}
|
| 4204 |
for x in document_choice][0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4205 |
|
| 4206 |
-
|
| 4207 |
-
|
| 4208 |
-
|
| 4209 |
-
|
| 4210 |
-
|
| 4211 |
-
0] == DocumentChoice.ALL.value:
|
| 4212 |
-
filter_kwargs = {"filter": {"chunk_id": {"$gte": 0}}} if query_action else \
|
| 4213 |
-
{"filter": {"chunk_id": {"$eq": -1}}}
|
| 4214 |
-
filter_kwargs_backup = {"filter": {"chunk_id": {"$gte": 0}}}
|
| 4215 |
-
elif len(document_choice) >= 2:
|
| 4216 |
-
if document_choice[0] == DocumentChoice.ALL.value:
|
| 4217 |
-
document_choice = document_choice[1:]
|
| 4218 |
-
or_filter = [
|
| 4219 |
-
{"source": {"$eq": x}, "chunk_id": {"$gte": 0}} if query_action else {"source": {"$eq": x},
|
| 4220 |
-
"chunk_id": {
|
| 4221 |
-
"$eq": -1}}
|
| 4222 |
-
for x in document_choice]
|
| 4223 |
-
filter_kwargs = dict(filter={"$or": or_filter})
|
| 4224 |
-
or_filter_backup = [
|
| 4225 |
-
{"source": {"$eq": x}} if query_action else {"source": {"$eq": x}}
|
| 4226 |
-
for x in document_choice]
|
| 4227 |
-
filter_kwargs_backup = dict(filter={"$or": or_filter_backup})
|
| 4228 |
-
elif len(document_choice) == 1:
|
| 4229 |
-
# degenerate UX bug in chroma
|
| 4230 |
-
one_filter = \
|
| 4231 |
-
[{"source": {"$eq": x}, "chunk_id": {"$gte": 0}} if query_action else {"source": {"$eq": x},
|
| 4232 |
-
"chunk_id": {
|
| 4233 |
-
"$eq": -1}}
|
| 4234 |
-
for x in document_choice][0]
|
| 4235 |
-
filter_kwargs = dict(filter=one_filter)
|
| 4236 |
-
one_filter_backup = \
|
| 4237 |
-
[{"source": {"$eq": x}} if query_action else {"source": {"$eq": x}}
|
| 4238 |
-
for x in document_choice][0]
|
| 4239 |
-
filter_kwargs_backup = dict(filter=one_filter_backup)
|
| 4240 |
-
else:
|
| 4241 |
-
# shouldn't reach
|
| 4242 |
-
filter_kwargs = {}
|
| 4243 |
-
filter_kwargs_backup = {}
|
| 4244 |
-
|
| 4245 |
-
if document_subset == DocumentSubset.TopKSources.name or query in [None, '', '\n']:
|
| 4246 |
-
db_documents, db_metadatas = get_docs_and_meta(db, top_k_docs, filter_kwargs=filter_kwargs,
|
| 4247 |
-
text_context_list=text_context_list)
|
| 4248 |
-
if len(db_documents) == 0 and filter_kwargs_backup:
|
| 4249 |
-
db_documents, db_metadatas = get_docs_and_meta(db, top_k_docs, filter_kwargs=filter_kwargs_backup,
|
| 4250 |
text_context_list=text_context_list)
|
| 4251 |
-
|
| 4252 |
-
|
| 4253 |
-
|
| 4254 |
-
|
| 4255 |
-
|
| 4256 |
-
|
| 4257 |
-
|
| 4258 |
-
|
| 4259 |
-
|
| 4260 |
-
|
| 4261 |
-
|
| 4262 |
-
|
| 4263 |
-
|
| 4264 |
-
|
| 4265 |
-
|
| 4266 |
-
|
| 4267 |
-
|
| 4268 |
-
|
| 4269 |
-
|
| 4270 |
-
|
| 4271 |
-
|
| 4272 |
-
|
| 4273 |
-
]
|
| 4274 |
-
if len(docs_with_score2) == 0 and len(docs_with_score) > 0:
|
| 4275 |
-
# old database without chunk_id, migration added 0 but didn't make -1 as that would be expensive
|
| 4276 |
-
# just do again and relax filter, let summarize operate on actual chunks if nothing else
|
| 4277 |
docs_with_score2 = [x for hx, cx, x in
|
| 4278 |
-
sorted(zip(doc_hashes, doc_chunk_ids, docs_with_score),
|
| 4279 |
-
|
| 4280 |
]
|
| 4281 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4282 |
|
| 4283 |
-
|
| 4284 |
-
|
| 4285 |
-
|
| 4286 |
-
|
| 4287 |
-
|
| 4288 |
-
|
| 4289 |
-
|
| 4290 |
-
docs_with_score, got_db_docs = get_docs_with_score(query, k_db, filter_kwargs, db, db_type,
|
| 4291 |
-
text_context_list=text_context_list,
|
| 4292 |
-
verbose=verbose)
|
| 4293 |
-
if len(docs_with_score) == 0 and filter_kwargs_backup:
|
| 4294 |
-
docs_with_score, got_db_docs = get_docs_with_score(query, k_db, filter_kwargs_backup, db,
|
| 4295 |
-
db_type,
|
| 4296 |
text_context_list=text_context_list,
|
| 4297 |
verbose=verbose)
|
| 4298 |
-
|
| 4299 |
-
|
| 4300 |
-
|
| 4301 |
-
|
| 4302 |
-
|
| 4303 |
-
|
| 4304 |
-
|
| 4305 |
-
|
| 4306 |
-
|
| 4307 |
-
|
| 4308 |
-
|
| 4309 |
-
|
| 4310 |
-
|
| 4311 |
-
|
| 4312 |
-
|
| 4313 |
-
|
| 4314 |
-
|
| 4315 |
-
|
| 4316 |
-
|
| 4317 |
-
|
| 4318 |
-
|
| 4319 |
-
|
| 4320 |
-
|
| 4321 |
-
|
| 4322 |
-
|
| 4323 |
-
|
| 4324 |
-
|
| 4325 |
-
|
| 4326 |
-
|
| 4327 |
-
|
| 4328 |
-
|
| 4329 |
-
|
| 4330 |
-
|
| 4331 |
-
|
| 4332 |
-
|
| 4333 |
-
|
| 4334 |
-
|
| 4335 |
-
|
| 4336 |
-
|
| 4337 |
-
|
| 4338 |
-
|
| 4339 |
-
|
| 4340 |
-
|
| 4341 |
-
|
| 4342 |
-
|
| 4343 |
-
assert external_handle_chat_conversation, "Should be handling only externally"
|
| 4344 |
-
llm.chat_conversation = chat_conversation[chat_index:]
|
| 4345 |
-
if hasattr(llm, 'context'):
|
| 4346 |
-
llm.context = context
|
| 4347 |
-
if hasattr(llm, 'iinput'):
|
| 4348 |
-
llm.iinput = iinput
|
| 4349 |
-
# avoid craziness
|
| 4350 |
-
if 0 < top_k_docs_trial < max_chunks:
|
| 4351 |
# avoid craziness
|
| 4352 |
-
if
|
| 4353 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4354 |
else:
|
| 4355 |
-
|
| 4356 |
-
elif top_k_docs_trial >= max_chunks:
|
| 4357 |
-
top_k_docs = max_chunks
|
| 4358 |
-
if top_k_docs > 0:
|
| 4359 |
-
docs_with_score = docs_with_score[:top_k_docs]
|
| 4360 |
-
elif one_doc_size is not None:
|
| 4361 |
-
docs_with_score = [docs_with_score[0][:one_doc_size]]
|
| 4362 |
else:
|
| 4363 |
-
|
| 4364 |
-
|
| 4365 |
-
|
| 4366 |
-
|
| 4367 |
-
|
| 4368 |
-
|
| 4369 |
-
text_context_list=[x[0].page_content for x in docs_with_score],
|
| 4370 |
-
max_input_tokens=total_tokens_for_docs)
|
| 4371 |
|
| 4372 |
-
|
| 4373 |
-
|
| 4374 |
-
# put most relevant chunks closest to question,
|
| 4375 |
-
# esp. if truncation occurs will be "oldest" or "farthest from response" text that is truncated
|
| 4376 |
-
# BUT: for small models, e.g. 6_9 pythia, if sees some stuff related to h2oGPT first, it can connect that and not listen to rest
|
| 4377 |
-
if docs_ordering_type in ['best_first']:
|
| 4378 |
-
pass
|
| 4379 |
-
elif docs_ordering_type in ['best_near_prompt', 'reverse_sort']:
|
| 4380 |
-
docs_with_score.reverse()
|
| 4381 |
-
elif docs_ordering_type in ['', None, 'reverse_ucurve_sort']:
|
| 4382 |
-
docs_with_score = reverse_ucurve_list(docs_with_score)
|
| 4383 |
-
else:
|
| 4384 |
-
raise ValueError("No such docs_ordering_type=%s" % docs_ordering_type)
|
| 4385 |
|
| 4386 |
-
|
| 4387 |
-
|
| 4388 |
-
|
| 4389 |
-
|
| 4390 |
-
|
| 4391 |
-
|
| 4392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4393 |
|
| 4394 |
-
|
|
|
|
|
|
|
| 4395 |
|
| 4396 |
if document_subset in non_query_commands:
|
| 4397 |
-
# no LLM use
|
| 4398 |
-
return docs, None, [], num_docs_before_cut, use_llm_if_no_docs, top_k_docs_max_show
|
| 4399 |
|
| 4400 |
# FIXME: WIP
|
| 4401 |
common_words_file = "data/NGSL_1.2_stats.csv.zip"
|
|
@@ -4413,6 +4349,7 @@ def get_chain(query=None,
|
|
| 4413 |
|
| 4414 |
if len(docs) == 0:
|
| 4415 |
# avoid context == in prompt then
|
|
|
|
| 4416 |
template = template_if_no_docs
|
| 4417 |
|
| 4418 |
got_db_docs = got_db_docs and len(text_context_list) < len(docs)
|
|
@@ -4424,7 +4361,8 @@ def get_chain(query=None,
|
|
| 4424 |
pre_prompt_query, prompt_query,
|
| 4425 |
pre_prompt_summary, prompt_summary,
|
| 4426 |
langchain_action,
|
| 4427 |
-
|
|
|
|
| 4428 |
auto_reduce_chunks,
|
| 4429 |
got_db_docs,
|
| 4430 |
add_search_to_context)
|
|
@@ -4442,7 +4380,10 @@ def get_chain(query=None,
|
|
| 4442 |
else:
|
| 4443 |
# only if use_openai_model = True, unused normally except in testing
|
| 4444 |
chain = load_qa_with_sources_chain(llm)
|
| 4445 |
-
|
|
|
|
|
|
|
|
|
|
| 4446 |
target = wrapped_partial(chain, chain_kwargs)
|
| 4447 |
elif langchain_action in [LangChainAction.SUMMARIZE_MAP.value,
|
| 4448 |
LangChainAction.SUMMARIZE_REFINE,
|
|
@@ -4486,7 +4427,7 @@ def get_chain(query=None,
|
|
| 4486 |
else:
|
| 4487 |
raise RuntimeError("No such langchain_action=%s" % langchain_action)
|
| 4488 |
|
| 4489 |
-
return docs, target, scores, num_docs_before_cut, use_llm_if_no_docs, top_k_docs_max_show
|
| 4490 |
|
| 4491 |
|
| 4492 |
def get_max_model_length(llm=None, tokenizer=None, inference_server=None, model_name=None):
|
|
@@ -4532,11 +4473,11 @@ def get_tokenizer(db=None, llm=None, tokenizer=None, inference_server=None, use_
|
|
| 4532 |
if hasattr(llm, 'pipeline') and hasattr(llm.pipeline, 'tokenizer'):
|
| 4533 |
# more accurate
|
| 4534 |
return llm.pipeline.tokenizer
|
| 4535 |
-
elif hasattr(llm, 'tokenizer')
|
| 4536 |
# e.g. TGI client mode etc.
|
| 4537 |
return llm.tokenizer
|
| 4538 |
elif inference_server in ['openai', 'openai_chat', 'openai_azure',
|
| 4539 |
-
'openai_azure_chat']
|
| 4540 |
return tokenizer
|
| 4541 |
elif isinstance(tokenizer, FakeTokenizer):
|
| 4542 |
return tokenizer
|
|
@@ -4559,7 +4500,8 @@ def get_template(query, iinput,
|
|
| 4559 |
pre_prompt_query, prompt_query,
|
| 4560 |
pre_prompt_summary, prompt_summary,
|
| 4561 |
langchain_action,
|
| 4562 |
-
|
|
|
|
| 4563 |
auto_reduce_chunks,
|
| 4564 |
got_db_docs,
|
| 4565 |
add_search_to_context):
|
|
@@ -4581,7 +4523,7 @@ def get_template(query, iinput,
|
|
| 4581 |
if langchain_action == LangChainAction.QUERY.value:
|
| 4582 |
if iinput:
|
| 4583 |
query = "%s\n%s" % (query, iinput)
|
| 4584 |
-
if not
|
| 4585 |
template_if_no_docs = template = """{context}{question}"""
|
| 4586 |
else:
|
| 4587 |
template = """%s
|
|
|
|
| 29 |
|
| 30 |
from joblib import delayed
|
| 31 |
from langchain.callbacks import streaming_stdout
|
|
|
|
| 32 |
from langchain.embeddings import HuggingFaceInstructEmbeddings
|
| 33 |
from langchain.llms.huggingface_pipeline import VALID_TASKS
|
| 34 |
from langchain.llms.utils import enforce_stop_tokens
|
| 35 |
+
from langchain.schema import LLMResult, Generation
|
| 36 |
from langchain.tools import PythonREPLTool
|
| 37 |
from langchain.tools.json.tool import JsonSpec
|
| 38 |
from tqdm import tqdm
|
|
|
|
| 944 |
assert self.tokenizer is not None
|
| 945 |
from h2oai_pipeline import H2OTextGenerationPipeline
|
| 946 |
prompt, num_prompt_tokens = H2OTextGenerationPipeline.limit_prompt(prompt, self.tokenizer)
|
| 947 |
+
# Note Replicate handles the prompting of the specific model
|
|
|
|
|
|
|
|
|
|
| 948 |
return super()._call(prompt, stop=stop, run_manager=run_manager, **kwargs)
|
| 949 |
|
| 950 |
def get_token_ids(self, text: str) -> List[int]:
|
|
|
|
| 953 |
# return _get_token_ids_default_method(text)
|
| 954 |
|
| 955 |
|
| 956 |
+
class H2OChatOpenAI(ChatOpenAI):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 957 |
@classmethod
|
| 958 |
def _all_required_field_names(cls) -> Set:
|
| 959 |
_all_required_field_names = super(ChatOpenAI, cls)._all_required_field_names()
|
| 960 |
_all_required_field_names.update({'top_p', 'frequency_penalty', 'presence_penalty', 'logit_bias'})
|
| 961 |
return _all_required_field_names
|
| 962 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 963 |
|
| 964 |
+
class H2OAzureChatOpenAI(AzureChatOpenAI):
|
| 965 |
@classmethod
|
| 966 |
def _all_required_field_names(cls) -> Set:
|
| 967 |
_all_required_field_names = super(AzureChatOpenAI, cls)._all_required_field_names()
|
| 968 |
_all_required_field_names.update({'top_p', 'frequency_penalty', 'presence_penalty', 'logit_bias'})
|
| 969 |
return _all_required_field_names
|
| 970 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 971 |
|
| 972 |
class H2OAzureOpenAI(AzureOpenAI):
|
| 973 |
@classmethod
|
|
|
|
| 1052 |
if 'meta/llama' in model_string:
|
| 1053 |
temperature = max(0.01, temperature if do_sample else 0)
|
| 1054 |
else:
|
| 1055 |
+
temperature =temperature if do_sample else 0
|
| 1056 |
gen_kwargs = dict(temperature=temperature,
|
| 1057 |
seed=1234,
|
| 1058 |
max_length=max_new_tokens, # langchain
|
|
|
|
| 1068 |
if system_prompt:
|
| 1069 |
gen_kwargs.update(dict(system_prompt=system_prompt))
|
| 1070 |
|
| 1071 |
+
# replicate handles prompting, so avoid get_response() filter
|
| 1072 |
+
prompter.prompt_type = 'plain'
|
| 1073 |
if stream_output:
|
| 1074 |
callbacks = [StreamingGradioCallbackHandler()]
|
| 1075 |
streamer = callbacks[0] if stream_output else None
|
|
|
|
| 1108 |
if inf_type == 'openai_chat' or inf_type == 'vllm_chat':
|
| 1109 |
cls = H2OChatOpenAI
|
| 1110 |
# FIXME: Support context, iinput
|
| 1111 |
+
# if inf_type == 'vllm_chat':
|
| 1112 |
+
# kwargs_extra.update(dict(tokenizer=tokenizer))
|
| 1113 |
openai_api_key = openai.api_key
|
| 1114 |
elif inf_type == 'openai_azure_chat':
|
| 1115 |
cls = H2OAzureChatOpenAI
|
|
|
|
| 1168 |
logit_bias=None if inf_type == 'vllm' else {},
|
| 1169 |
max_retries=6,
|
| 1170 |
streaming=stream_output,
|
|
|
|
|
|
|
| 1171 |
**kwargs_extra
|
| 1172 |
)
|
| 1173 |
streamer = callbacks[0] if stream_output else None
|
|
|
|
| 3500 |
prompter = Prompter(prompt_type, prompt_dict, debug=False, chat=chat, stream_output=stream_output,
|
| 3501 |
system_prompt=system_prompt)
|
| 3502 |
|
| 3503 |
+
use_docs_planned = False
|
| 3504 |
scores = []
|
| 3505 |
chain = None
|
| 3506 |
|
|
|
|
| 3517 |
missing_kwargs = [x for x in func_names if x not in sim_kwargs]
|
| 3518 |
assert not missing_kwargs, "Missing: %s" % missing_kwargs
|
| 3519 |
docs, chain, scores, \
|
| 3520 |
+
use_docs_planned, num_docs_before_cut, \
|
| 3521 |
+
use_llm_if_no_docs, llm_mode, top_k_docs_max_show = \
|
| 3522 |
get_chain(**sim_kwargs)
|
| 3523 |
if document_subset in non_query_commands:
|
| 3524 |
formatted_doc_chunks = '\n\n'.join([get_url(x) + '\n\n' + x.page_content for x in docs])
|
|
|
|
| 3539 |
ret, extra = get_sources_answer(*get_answer_args, **get_answer_kwargs)
|
| 3540 |
yield dict(prompt=prompt_basic, response=formatted_doc_chunks, sources=extra, num_prompt_tokens=0)
|
| 3541 |
return
|
| 3542 |
+
if not use_llm_if_no_docs:
|
| 3543 |
+
if not docs and langchain_action in [LangChainAction.SUMMARIZE_MAP.value,
|
| 3544 |
+
LangChainAction.SUMMARIZE_ALL.value,
|
| 3545 |
+
LangChainAction.SUMMARIZE_REFINE.value]:
|
| 3546 |
+
ret = 'No relevant documents to summarize.' if num_docs_before_cut else 'No documents to summarize.'
|
| 3547 |
+
extra = ''
|
| 3548 |
+
yield dict(prompt=prompt_basic, response=ret, sources=extra, num_prompt_tokens=0)
|
| 3549 |
+
return
|
| 3550 |
+
if not docs and not llm_mode:
|
| 3551 |
+
ret = 'No relevant documents to query (for chatting with LLM, pick Resources->Collections->LLM).' if num_docs_before_cut else 'No documents to query (for chatting with LLM, pick Resources->Collections->LLM).'
|
| 3552 |
extra = ''
|
| 3553 |
yield dict(prompt=prompt_basic, response=ret, sources=extra, num_prompt_tokens=0)
|
| 3554 |
return
|
| 3555 |
|
| 3556 |
+
if chain is None and not langchain_only_model:
|
| 3557 |
+
# here if no docs at all and not HF type
|
| 3558 |
+
# can only return if HF type
|
| 3559 |
return
|
| 3560 |
|
| 3561 |
# context stuff similar to used in evaluate()
|
|
|
|
| 3656 |
prompt = prompt_basic
|
| 3657 |
num_prompt_tokens = get_token_count(prompt, tokenizer)
|
| 3658 |
|
| 3659 |
+
if not use_docs_planned:
|
|
|
|
| 3660 |
ret = answer
|
| 3661 |
extra = ''
|
| 3662 |
yield dict(prompt=prompt, response=ret, sources=extra, num_prompt_tokens=num_prompt_tokens)
|
|
|
|
| 3815 |
if text_context_list is None:
|
| 3816 |
text_context_list = []
|
| 3817 |
|
| 3818 |
+
# default value:
|
| 3819 |
+
llm_mode = langchain_mode in ['Disabled', 'LLM'] and len(text_context_list) == 0
|
| 3820 |
query_action = langchain_action == LangChainAction.QUERY.value
|
| 3821 |
summarize_action = langchain_action in [LangChainAction.SUMMARIZE_MAP.value,
|
| 3822 |
LangChainAction.SUMMARIZE_ALL.value,
|
|
|
|
| 3848 |
add_search_to_context &= len(docs_search) > 0
|
| 3849 |
top_k_docs_max_show = max(top_k_docs_max_show, len(docs_search))
|
| 3850 |
|
| 3851 |
+
if len(text_context_list) > 0:
|
| 3852 |
+
llm_mode = False
|
| 3853 |
use_llm_if_no_docs = True
|
| 3854 |
|
| 3855 |
from src.output_parser import H2OMRKLOutputParser
|
|
|
|
| 3877 |
|
| 3878 |
docs = []
|
| 3879 |
scores = []
|
| 3880 |
+
use_docs_planned = False
|
| 3881 |
num_docs_before_cut = 0
|
| 3882 |
use_llm_if_no_docs = True
|
| 3883 |
+
return docs, target, scores, use_docs_planned, num_docs_before_cut, use_llm_if_no_docs, llm_mode, top_k_docs_max_show
|
| 3884 |
|
| 3885 |
if LangChainAgent.COLLECTION.value in langchain_agents:
|
| 3886 |
output_parser = H2OMRKLOutputParser()
|
|
|
|
| 3899 |
|
| 3900 |
docs = []
|
| 3901 |
scores = []
|
| 3902 |
+
use_docs_planned = False
|
| 3903 |
num_docs_before_cut = 0
|
| 3904 |
use_llm_if_no_docs = True
|
| 3905 |
+
return docs, target, scores, use_docs_planned, num_docs_before_cut, use_llm_if_no_docs, llm_mode, top_k_docs_max_show
|
| 3906 |
|
| 3907 |
if LangChainAgent.PYTHON.value in langchain_agents and inference_server.startswith('openai'):
|
| 3908 |
chain = create_python_agent(
|
|
|
|
| 3918 |
|
| 3919 |
docs = []
|
| 3920 |
scores = []
|
| 3921 |
+
use_docs_planned = False
|
| 3922 |
num_docs_before_cut = 0
|
| 3923 |
use_llm_if_no_docs = True
|
| 3924 |
+
return docs, target, scores, use_docs_planned, num_docs_before_cut, use_llm_if_no_docs, llm_mode, top_k_docs_max_show
|
| 3925 |
|
| 3926 |
if LangChainAgent.PANDAS.value in langchain_agents and inference_server.startswith('openai_chat'):
|
| 3927 |
# FIXME: DATA
|
|
|
|
| 3938 |
|
| 3939 |
docs = []
|
| 3940 |
scores = []
|
| 3941 |
+
use_docs_planned = False
|
| 3942 |
num_docs_before_cut = 0
|
| 3943 |
use_llm_if_no_docs = True
|
| 3944 |
+
return docs, target, scores, use_docs_planned, num_docs_before_cut, use_llm_if_no_docs, llm_mode, top_k_docs_max_show
|
| 3945 |
|
| 3946 |
if isinstance(document_choice, str):
|
| 3947 |
document_choice = [document_choice]
|
|
|
|
| 3971 |
|
| 3972 |
docs = []
|
| 3973 |
scores = []
|
| 3974 |
+
use_docs_planned = False
|
| 3975 |
num_docs_before_cut = 0
|
| 3976 |
use_llm_if_no_docs = True
|
| 3977 |
+
return docs, target, scores, use_docs_planned, num_docs_before_cut, use_llm_if_no_docs, llm_mode, top_k_docs_max_show
|
| 3978 |
|
| 3979 |
if isinstance(document_choice, str):
|
| 3980 |
document_choice = [document_choice]
|
|
|
|
| 3985 |
document_choice_agent = [x for x in document_choice_agent if x.endswith('.csv')]
|
| 3986 |
if LangChainAgent.CSV.value in langchain_agents and len(document_choice_agent) == 1 and document_choice_agent[
|
| 3987 |
0].endswith(
|
| 3988 |
+
'.csv'):
|
| 3989 |
data_file = document_choice[0]
|
| 3990 |
if inference_server.startswith('openai_chat'):
|
| 3991 |
chain = create_csv_agent(
|
|
|
|
| 4006 |
|
| 4007 |
docs = []
|
| 4008 |
scores = []
|
| 4009 |
+
use_docs_planned = False
|
| 4010 |
num_docs_before_cut = 0
|
| 4011 |
use_llm_if_no_docs = True
|
| 4012 |
+
return docs, target, scores, use_docs_planned, num_docs_before_cut, use_llm_if_no_docs, llm_mode, top_k_docs_max_show
|
| 4013 |
+
|
| 4014 |
+
# determine whether use of context out of docs is planned
|
| 4015 |
+
if not use_openai_model and prompt_type not in ['plain'] or langchain_only_model:
|
| 4016 |
+
if llm_mode:
|
| 4017 |
+
use_docs_planned = False
|
| 4018 |
+
else:
|
| 4019 |
+
use_docs_planned = True
|
| 4020 |
+
else:
|
| 4021 |
+
use_docs_planned = True
|
| 4022 |
|
| 4023 |
# https://github.com/hwchase17/langchain/issues/1946
|
| 4024 |
# FIXME: Seems to way to get size of chroma db to limit top_k_docs to avoid
|
|
|
|
| 4090 |
pre_prompt_query, prompt_query,
|
| 4091 |
pre_prompt_summary, prompt_summary,
|
| 4092 |
langchain_action,
|
| 4093 |
+
llm_mode,
|
| 4094 |
+
use_docs_planned,
|
| 4095 |
auto_reduce_chunks,
|
| 4096 |
got_db_docs,
|
| 4097 |
add_search_to_context)
|
|
|
|
| 4099 |
max_input_tokens = get_max_input_tokens(llm=llm, tokenizer=tokenizer, inference_server=inference_server,
|
| 4100 |
model_name=model_name, max_new_tokens=max_new_tokens)
|
| 4101 |
|
| 4102 |
+
if (db or text_context_list) and use_docs_planned:
|
| 4103 |
+
if hasattr(db, '_persist_directory'):
|
| 4104 |
+
lock_file = get_db_lock_file(db, lock_type='sim')
|
| 4105 |
+
else:
|
| 4106 |
+
base_path = 'locks'
|
| 4107 |
+
base_path = makedirs(base_path, exist_ok=True, tmp_ok=True, use_base=True)
|
| 4108 |
+
name_path = "sim.lock"
|
| 4109 |
+
lock_file = os.path.join(base_path, name_path)
|
| 4110 |
+
|
| 4111 |
+
if not (isinstance(db, Chroma) or isinstance(db, ChromaMig) or ChromaMig.__name__ in str(db)):
|
| 4112 |
+
# only chroma supports filtering
|
| 4113 |
+
filter_kwargs = {}
|
| 4114 |
+
filter_kwargs_backup = {}
|
| 4115 |
+
else:
|
| 4116 |
+
import logging
|
| 4117 |
+
logging.getLogger("chromadb").setLevel(logging.ERROR)
|
| 4118 |
+
assert document_choice is not None, "Document choice was None"
|
| 4119 |
+
if isinstance(db, Chroma):
|
| 4120 |
+
filter_kwargs_backup = {} # shouldn't ever need backup
|
| 4121 |
+
# chroma >= 0.4
|
| 4122 |
+
if len(document_choice) == 0 or len(document_choice) >= 1 and document_choice[
|
| 4123 |
+
0] == DocumentChoice.ALL.value:
|
| 4124 |
+
filter_kwargs = {"filter": {"chunk_id": {"$gte": 0}}} if query_action else \
|
| 4125 |
+
{"filter": {"chunk_id": {"$eq": -1}}}
|
| 4126 |
+
else:
|
| 4127 |
+
if document_choice[0] == DocumentChoice.ALL.value:
|
| 4128 |
+
document_choice = document_choice[1:]
|
| 4129 |
+
if len(document_choice) == 0:
|
| 4130 |
+
filter_kwargs = {}
|
| 4131 |
+
elif len(document_choice) > 1:
|
| 4132 |
+
or_filter = [
|
| 4133 |
+
{"$and": [dict(source={"$eq": x}), dict(chunk_id={"$gte": 0})]} if query_action else {
|
| 4134 |
+
"$and": [dict(source={"$eq": x}), dict(chunk_id={"$eq": -1})]}
|
| 4135 |
+
for x in document_choice]
|
| 4136 |
+
filter_kwargs = dict(filter={"$or": or_filter})
|
| 4137 |
+
else:
|
| 4138 |
+
# still chromadb UX bug, have to do different thing for 1 vs. 2+ docs when doing filter
|
| 4139 |
+
one_filter = \
|
| 4140 |
+
[{"source": {"$eq": x}, "chunk_id": {"$gte": 0}} if query_action else {
|
| 4141 |
+
"source": {"$eq": x},
|
| 4142 |
+
"chunk_id": {
|
| 4143 |
+
"$eq": -1}}
|
| 4144 |
+
for x in document_choice][0]
|
| 4145 |
+
|
| 4146 |
+
filter_kwargs = dict(filter={"$and": [dict(source=one_filter['source']),
|
| 4147 |
+
dict(chunk_id=one_filter['chunk_id'])]})
|
| 4148 |
else:
|
| 4149 |
+
# migration for chroma < 0.4
|
| 4150 |
+
if len(document_choice) == 0 or len(document_choice) >= 1 and document_choice[
|
| 4151 |
+
0] == DocumentChoice.ALL.value:
|
| 4152 |
+
filter_kwargs = {"filter": {"chunk_id": {"$gte": 0}}} if query_action else \
|
| 4153 |
+
{"filter": {"chunk_id": {"$eq": -1}}}
|
| 4154 |
+
filter_kwargs_backup = {"filter": {"chunk_id": {"$gte": 0}}}
|
| 4155 |
+
elif len(document_choice) >= 2:
|
| 4156 |
+
if document_choice[0] == DocumentChoice.ALL.value:
|
| 4157 |
+
document_choice = document_choice[1:]
|
| 4158 |
or_filter = [
|
| 4159 |
+
{"source": {"$eq": x}, "chunk_id": {"$gte": 0}} if query_action else {"source": {"$eq": x},
|
| 4160 |
+
"chunk_id": {
|
| 4161 |
+
"$eq": -1}}
|
| 4162 |
for x in document_choice]
|
| 4163 |
filter_kwargs = dict(filter={"$or": or_filter})
|
| 4164 |
+
or_filter_backup = [
|
| 4165 |
+
{"source": {"$eq": x}} if query_action else {"source": {"$eq": x}}
|
| 4166 |
+
for x in document_choice]
|
| 4167 |
+
filter_kwargs_backup = dict(filter={"$or": or_filter_backup})
|
| 4168 |
+
elif len(document_choice) == 1:
|
| 4169 |
+
# degenerate UX bug in chroma
|
| 4170 |
one_filter = \
|
| 4171 |
+
[{"source": {"$eq": x}, "chunk_id": {"$gte": 0}} if query_action else {"source": {"$eq": x},
|
| 4172 |
+
"chunk_id": {
|
| 4173 |
+
"$eq": -1}}
|
|
|
|
| 4174 |
for x in document_choice][0]
|
| 4175 |
+
filter_kwargs = dict(filter=one_filter)
|
| 4176 |
+
one_filter_backup = \
|
| 4177 |
+
[{"source": {"$eq": x}} if query_action else {"source": {"$eq": x}}
|
| 4178 |
+
for x in document_choice][0]
|
| 4179 |
+
filter_kwargs_backup = dict(filter=one_filter_backup)
|
| 4180 |
+
else:
|
| 4181 |
+
# shouldn't reach
|
| 4182 |
+
filter_kwargs = {}
|
| 4183 |
+
filter_kwargs_backup = {}
|
| 4184 |
|
| 4185 |
+
if llm_mode:
|
| 4186 |
+
docs = []
|
| 4187 |
+
scores = []
|
| 4188 |
+
elif document_subset == DocumentSubset.TopKSources.name or query in [None, '', '\n']:
|
| 4189 |
+
db_documents, db_metadatas = get_docs_and_meta(db, top_k_docs, filter_kwargs=filter_kwargs,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4190 |
text_context_list=text_context_list)
|
| 4191 |
+
if len(db_documents) == 0 and filter_kwargs_backup:
|
| 4192 |
+
db_documents, db_metadatas = get_docs_and_meta(db, top_k_docs, filter_kwargs=filter_kwargs_backup,
|
| 4193 |
+
text_context_list=text_context_list)
|
| 4194 |
+
|
| 4195 |
+
if top_k_docs == -1:
|
| 4196 |
+
top_k_docs = len(db_documents)
|
| 4197 |
+
# similar to langchain's chroma's _results_to_docs_and_scores
|
| 4198 |
+
docs_with_score = [(Document(page_content=result[0], metadata=result[1] or {}), 0)
|
| 4199 |
+
for result in zip(db_documents, db_metadatas)]
|
| 4200 |
+
# set in metadata original order of docs
|
| 4201 |
+
[x[0].metadata.update(orig_index=ii) for ii, x in enumerate(docs_with_score)]
|
| 4202 |
+
|
| 4203 |
+
# order documents
|
| 4204 |
+
doc_hashes = [x.get('doc_hash', 'None') for x in db_metadatas]
|
| 4205 |
+
if query_action:
|
| 4206 |
+
doc_chunk_ids = [x.get('chunk_id', 0) for x in db_metadatas]
|
| 4207 |
+
docs_with_score2 = [x for hx, cx, x in
|
| 4208 |
+
sorted(zip(doc_hashes, doc_chunk_ids, docs_with_score), key=lambda x: (x[0], x[1]))
|
| 4209 |
+
if cx >= 0]
|
| 4210 |
+
else:
|
| 4211 |
+
assert summarize_action
|
| 4212 |
+
doc_chunk_ids = [x.get('chunk_id', -1) for x in db_metadatas]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4213 |
docs_with_score2 = [x for hx, cx, x in
|
| 4214 |
+
sorted(zip(doc_hashes, doc_chunk_ids, docs_with_score), key=lambda x: (x[0], x[1]))
|
| 4215 |
+
if cx == -1
|
| 4216 |
]
|
| 4217 |
+
if len(docs_with_score2) == 0 and len(docs_with_score) > 0:
|
| 4218 |
+
# old database without chunk_id, migration added 0 but didn't make -1 as that would be expensive
|
| 4219 |
+
# just do again and relax filter, let summarize operate on actual chunks if nothing else
|
| 4220 |
+
docs_with_score2 = [x for hx, cx, x in
|
| 4221 |
+
sorted(zip(doc_hashes, doc_chunk_ids, docs_with_score),
|
| 4222 |
+
key=lambda x: (x[0], x[1]))
|
| 4223 |
+
]
|
| 4224 |
+
docs_with_score = docs_with_score2
|
| 4225 |
|
| 4226 |
+
docs_with_score = docs_with_score[:top_k_docs]
|
| 4227 |
+
docs = [x[0] for x in docs_with_score]
|
| 4228 |
+
scores = [x[1] for x in docs_with_score]
|
| 4229 |
+
num_docs_before_cut = len(docs)
|
| 4230 |
+
else:
|
| 4231 |
+
with filelock.FileLock(lock_file):
|
| 4232 |
+
docs_with_score, got_db_docs = get_docs_with_score(query, k_db, filter_kwargs, db, db_type,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4233 |
text_context_list=text_context_list,
|
| 4234 |
verbose=verbose)
|
| 4235 |
+
if len(docs_with_score) == 0 and filter_kwargs_backup:
|
| 4236 |
+
docs_with_score, got_db_docs = get_docs_with_score(query, k_db, filter_kwargs_backup, db,
|
| 4237 |
+
db_type,
|
| 4238 |
+
text_context_list=text_context_list,
|
| 4239 |
+
verbose=verbose)
|
| 4240 |
+
|
| 4241 |
+
tokenizer = get_tokenizer(db=db, llm=llm, tokenizer=tokenizer, inference_server=inference_server,
|
| 4242 |
+
use_openai_model=use_openai_model,
|
| 4243 |
+
db_type=db_type)
|
| 4244 |
+
# NOTE: if map_reduce, then no need to auto reduce chunks
|
| 4245 |
+
if query_action and (top_k_docs == -1 or auto_reduce_chunks):
|
| 4246 |
+
top_k_docs_tokenize = 100
|
| 4247 |
+
docs_with_score = docs_with_score[:top_k_docs_tokenize]
|
| 4248 |
+
|
| 4249 |
+
prompt_no_docs = template.format(context='', question=query)
|
| 4250 |
+
|
| 4251 |
+
model_max_length = tokenizer.model_max_length
|
| 4252 |
+
chat = True # FIXME?
|
| 4253 |
+
|
| 4254 |
+
# first docs_with_score are most important with highest score
|
| 4255 |
+
full_prompt, \
|
| 4256 |
+
instruction, iinput, context, \
|
| 4257 |
+
num_prompt_tokens, max_new_tokens, \
|
| 4258 |
+
num_prompt_tokens0, num_prompt_tokens_actual, \
|
| 4259 |
+
chat_index, top_k_docs_trial, one_doc_size = \
|
| 4260 |
+
get_limited_prompt(prompt_no_docs,
|
| 4261 |
+
iinput,
|
| 4262 |
+
tokenizer,
|
| 4263 |
+
prompter=prompter,
|
| 4264 |
+
inference_server=inference_server,
|
| 4265 |
+
prompt_type=prompt_type,
|
| 4266 |
+
prompt_dict=prompt_dict,
|
| 4267 |
+
chat=chat,
|
| 4268 |
+
max_new_tokens=max_new_tokens,
|
| 4269 |
+
system_prompt=system_prompt,
|
| 4270 |
+
context=context,
|
| 4271 |
+
chat_conversation=chat_conversation,
|
| 4272 |
+
text_context_list=[x[0].page_content for x in docs_with_score],
|
| 4273 |
+
keep_sources_in_context=keep_sources_in_context,
|
| 4274 |
+
model_max_length=model_max_length,
|
| 4275 |
+
memory_restriction_level=memory_restriction_level,
|
| 4276 |
+
langchain_mode=langchain_mode,
|
| 4277 |
+
add_chat_history_to_context=add_chat_history_to_context,
|
| 4278 |
+
min_max_new_tokens=min_max_new_tokens,
|
| 4279 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4280 |               # avoid craziness
| 4281 | +             if 0 < top_k_docs_trial < max_chunks:
| 4282 | +                 # avoid craziness
| 4283 | +                 if top_k_docs == -1:
| 4284 | +                     top_k_docs = top_k_docs_trial
| 4285 | +                 else:
| 4286 | +                     top_k_docs = min(top_k_docs, top_k_docs_trial)
| 4287 | +             elif top_k_docs_trial >= max_chunks:
| 4288 | +                 top_k_docs = max_chunks
| 4289 | +             if top_k_docs > 0:
| 4290 | +                 docs_with_score = docs_with_score[:top_k_docs]
| 4291 | +             elif one_doc_size is not None:
| 4292 | +                 docs_with_score = [docs_with_score[0][:one_doc_size]]
| 4293 |               else:
| 4294 | +                 docs_with_score = []
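
The get_limited_prompt call above is what produces top_k_docs_trial and one_doc_size: it counts tokens for the prompt scaffolding plus each retrieved chunk and reports how many chunks actually fit the context window. The real helper also budgets chat history, system prompt and generation length; the following is only a rough sketch of the core idea, with count_tokens standing in for a real tokenizer call:

def fit_docs_to_budget(doc_texts, count_tokens, model_max_length,
                       prompt_overhead_tokens, max_new_tokens):
    # budget left for retrieved chunks after reserving room for the answer
    budget = model_max_length - max_new_tokens - prompt_overhead_tokens
    kept, used, one_doc_size = 0, 0, None
    for text in doc_texts:
        n = count_tokens(text)
        if used + n > budget:
            if kept == 0 and budget > 0:
                # even the first chunk is too large: keep only a truncated slice of it
                one_doc_size = budget
            break
        kept += 1
        used += n
    return kept, one_doc_size, used


# e.g. with a crude 4-chars-per-token estimate:
# fit_docs_to_budget(texts, lambda t: len(t) // 4, 2048, 500, 256)

The same kind of (top_k_docs, one_doc_size, num_doc_tokens) triple is what get_docs_tokens returns in the summarization branch that follows.
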
| 4295 |           else:
| 4296 | +             if total_tokens_for_docs is not None:
| 4297 | +                 # used to limit tokens for summarization, e.g. public instance
| 4298 | +                 top_k_docs, one_doc_size, num_doc_tokens = \
| 4299 | +                     get_docs_tokens(tokenizer,
| 4300 | +                                     text_context_list=[x[0].page_content for x in docs_with_score],
| 4301 | +                                     max_input_tokens=total_tokens_for_docs)
| 4302 |
| 4303 | +             docs_with_score = docs_with_score[:top_k_docs]
| 4304 |
| 4305 | +         # put most relevant chunks closest to question,
| 4306 | +         # esp. if truncation occurs will be "oldest" or "farthest from response" text that is truncated
| 4307 | +         # BUT: for small models, e.g. 6_9 pythia, if sees some stuff related to h2oGPT first, it can connect that and not listen to rest
| 4308 | +         if docs_ordering_type in ['best_first']:
| 4309 | +             pass
| 4310 | +         elif docs_ordering_type in ['best_near_prompt', 'reverse_sort']:
| 4311 | +             docs_with_score.reverse()
| 4312 | +         elif docs_ordering_type in ['', None, 'reverse_ucurve_sort']:
| 4313 | +             docs_with_score = reverse_ucurve_list(docs_with_score)
| 4314 | +         else:
| 4315 | +             raise ValueError("No such docs_ordering_type=%s" % docs_ordering_type)
| 4316 | +
| 4317 | +         # cut off so no high distance docs/sources considered
| 4318 | +         num_docs_before_cut = len(docs_with_score)
| 4319 | +         docs = [x[0] for x in docs_with_score if x[1] < cut_distance]
| 4320 | +         scores = [x[1] for x in docs_with_score if x[1] < cut_distance]
| 4321 | +         if len(scores) > 0 and verbose:
| 4322 | +             print("Distance: min: %s max: %s mean: %s median: %s" %
| 4323 | +                   (scores[0], scores[-1], np.mean(scores), np.median(scores)), flush=True)
| 4324 | +     else:
| 4325 | +         docs = []
| 4326 | +         scores = []
| 4327 |
| 4328 | +     if not docs and use_docs_planned and not langchain_only_model:
| 4329 | +         # if HF type and have no docs, can bail out
| 4330 | +         return docs, None, [], False, num_docs_before_cut, use_llm_if_no_docs, llm_mode, top_k_docs_max_show
| 4331 |
| 4332 |       if document_subset in non_query_commands:
| 4333 | +         # no LLM use
| 4334 | +         return docs, None, [], False, num_docs_before_cut, use_llm_if_no_docs, llm_mode, top_k_docs_max_show
| 4335 |
| 4336 |       # FIXME: WIP
| 4337 |       common_words_file = "data/NGSL_1.2_stats.csv.zip"
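
reverse_ucurve_list itself is defined elsewhere in this file, so only its call site is visible here. The intent, per the comments above, is a "lost in the middle" mitigation: with docs_with_score sorted best-first, the reordering pushes the strongest chunks toward the two ends of the context and the weakest into the middle. A plausible sketch of such a helper (not necessarily the exact implementation):

def ucurve_order(items):
    # items are assumed sorted best-first; alternate them between the two ends
    # so the best chunk lands right next to the question and the weakest
    # chunks end up buried in the middle of the context
    front, back = [], []
    for i, item in enumerate(items):
        (back if i % 2 == 0 else front).append(item)
    return front + back[::-1]


print(ucurve_order(['A', 'B', 'C', 'D']))  # ['B', 'D', 'C', 'A'] -> 'A' (best) sits closest to the question

The cut_distance filter right after the reordering then simply drops any chunk whose vector-store distance is too large before the sources are shown or sent to the LLM.
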
| 4349 |
| 4350 |       if len(docs) == 0:
| 4351 |           # avoid context == in prompt then
| 4352 | +         use_docs_planned = False
| 4353 |           template = template_if_no_docs
| 4354 |
| 4355 |       got_db_docs = got_db_docs and len(text_context_list) < len(docs)

| 4361 |                        pre_prompt_query, prompt_query,
| 4362 |                        pre_prompt_summary, prompt_summary,
| 4363 |                        langchain_action,
| 4364 | +                      llm_mode,
| 4365 | +                      use_docs_planned,
| 4366 |                        auto_reduce_chunks,
| 4367 |                        got_db_docs,
| 4368 |                        add_search_to_context)
| 4380 |           else:
| 4381 |               # only if use_openai_model = True, unused normally except in testing
| 4382 |               chain = load_qa_with_sources_chain(llm)
| 4383 | +             if not use_docs_planned:
| 4384 | +                 chain_kwargs = dict(input_documents=[], question=query)
| 4385 | +             else:
| 4386 | +                 chain_kwargs = dict(input_documents=docs, question=query)
| 4387 |               target = wrapped_partial(chain, chain_kwargs)
| 4388 |       elif langchain_action in [LangChainAction.SUMMARIZE_MAP.value,
| 4389 |                                 LangChainAction.SUMMARIZE_REFINE,

| 4427 |       else:
| 4428 |           raise RuntimeError("No such langchain_action=%s" % langchain_action)
| 4429 |
| 4430 | +     return docs, target, scores, use_docs_planned, num_docs_before_cut, use_llm_if_no_docs, llm_mode, top_k_docs_max_show
| 4431 |
| 4432 |
| 4433 |   def get_max_model_length(llm=None, tokenizer=None, inference_server=None, model_name=None):
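
For context on the use_docs_planned branch a few rows up (lines 4383-4387): load_qa_with_sources_chain is the plain LangChain fallback used only when use_openai_model=True, and the chain is invoked with a dict of input_documents and question. A rough usage sketch against the classic LangChain API of this era (import paths shifted across releases, so treat the exact modules as an assumption); FakeListLLM is just a stand-in LLM for illustration:

from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.docstore.document import Document
from langchain.llms.fake import FakeListLLM

llm = FakeListLLM(responses=["h2oGPT is an open-source LLM project.\nSOURCES: doc1"])
chain = load_qa_with_sources_chain(llm)  # default "stuff" chain type

docs = [Document(page_content="h2oGPT is an open-source LLM project.",
                 metadata={"source": "doc1"})]
# the not-use_docs_planned branch passes input_documents=[] instead
result = chain(dict(input_documents=docs, question="What is h2oGPT?"))
print(result["output_text"])
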
| 4473 |       if hasattr(llm, 'pipeline') and hasattr(llm.pipeline, 'tokenizer'):
| 4474 |           # more accurate
| 4475 |           return llm.pipeline.tokenizer
| 4476 | +     elif hasattr(llm, 'tokenizer'):
| 4477 |           # e.g. TGI client mode etc.
| 4478 |           return llm.tokenizer
| 4479 |       elif inference_server in ['openai', 'openai_chat', 'openai_azure',
| 4480 | +                               'openai_azure_chat']:
| 4481 |           return tokenizer
| 4482 |       elif isinstance(tokenizer, FakeTokenizer):
| 4483 |           return tokenizer
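
The branch above resolves a tokenizer in priority order: the HF pipeline's own tokenizer, then an attribute on the LLM wrapper (e.g. TGI client mode), then whatever was passed in for OpenAI-style servers, and finally a FakeTokenizer. FakeTokenizer's internals are not shown in this diff; a crude stand-in of the same shape, useful only for rough budgeting when no real tokenizer is available (the 4-characters-per-token ratio is an assumption, not necessarily h2oGPT's heuristic):

class ApproxTokenizer:
    def __init__(self, model_max_length=2048, chars_per_token=4):
        self.model_max_length = model_max_length
        self.chars_per_token = chars_per_token

    def encode(self, text):
        # pseudo token ids, one per ~chars_per_token characters
        return list(range(max(1, len(text) // self.chars_per_token)))

    def num_tokens(self, text):
        return len(self.encode(text))


tok = ApproxTokenizer()
print(tok.num_tokens("How many tokens might this sentence cost?"))  # 10 with the default ratio
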
| 4500 |                    pre_prompt_query, prompt_query,
| 4501 |                    pre_prompt_summary, prompt_summary,
| 4502 |                    langchain_action,
| 4503 | +                  llm_mode,
| 4504 | +                  use_docs_planned,
| 4505 |                    auto_reduce_chunks,
| 4506 |                    got_db_docs,
| 4507 |                    add_search_to_context):
| 4523 |       if langchain_action == LangChainAction.QUERY.value:
| 4524 |           if iinput:
| 4525 |               query = "%s\n%s" % (query, iinput)
| 4526 | +         if llm_mode or not use_docs_planned:
| 4527 |               template_if_no_docs = template = """{context}{question}"""
| 4528 |           else:
| 4529 |               template = """%s
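
The hunk above ends mid-template: when llm_mode is set or no documents are planned, both template and template_if_no_docs collapse to the bare "{context}{question}" form, which is also what produced prompt_no_docs = template.format(context='', question=query) earlier. A small illustration (the docs-bearing template below is a simplified stand-in, not h2oGPT's literal prompt text):

template_if_no_docs = """{context}{question}"""
# simplified stand-in for the real docs template built from pre_prompt_query/prompt_query
template_with_docs = """{context}

According to only the sources above, {question}"""

query = "What does h2oGPT use for retrieval?"
context = '"""\nh2oGPT retrieves chunks from a vector database.\n"""\n'

print(template_if_no_docs.format(context='', question=query))      # plain LLM prompt
print(template_with_docs.format(context=context, question=query))  # grounded prompt
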
src/gradio_runner.py
CHANGED
@@ -737,7 +737,8 @@ def go_gradio(**kwargs):
| 737  |                                                   visible=True,
| 738  |                                                   elem_id="langchain_agents",
| 739  |                                                   filterable=False)
| 740  | -           visible_doc_track = upload_visible and kwargs['visible_doc_track'] and not kwargs[
| 741  |             row_doc_track = gr.Row(visible=visible_doc_track)
| 742  |             with row_doc_track:
| 743  |                 if kwargs['langchain_mode'] in langchain_modes_non_db:

@@ -784,6 +785,9 @@ def go_gradio(**kwargs):
| 784  |                 text_output_nochat_api = gr.Textbox(lines=5, label='API nochat output', visible=False,
| 785  |                                                     show_copy_button=True)
| 786  |
| 787  |             # CHAT
| 788  |             col_chat = gr.Column(visible=kwargs['chat'])
| 789  |             with col_chat:

@@ -806,7 +810,8 @@ def go_gradio(**kwargs):
| 806  |                                          size="sm",
| 807  |                                          min_width=24,
| 808  |                                          file_types=['.' + x for x in file_types],
| 809  | -                                        file_count="multiple"
| 810  |
| 811  |             submit_buttons = gr.Row(equal_height=False, visible=kwargs['visible_submit_buttons'])
| 812  |             with submit_buttons:

@@ -886,11 +891,9 @@ def go_gradio(**kwargs):
| 886  |                                  visible=sources_visible and allow_upload_to_user_data)
| 887  |             with gr.Column(scale=4):
| 888  |                 pass
| 889  |         with gr.Row():
| 890  |             with gr.Column(scale=1):
| 891  | -               visible_add_remove_collection = (allow_upload_to_user_data or
| 892  | -                                                allow_upload_to_my_data) and \
| 893  | -                                               kwargs['langchain_mode'] != 'Disabled'
| 894  |                 add_placeholder = "e.g. UserData2, shared, user_path2" \
| 895  |                     if not is_public else "e.g. MyData2, personal (optional)"
| 896  |                 remove_placeholder = "e.g. UserData2" if not is_public else "e.g. MyData2"

@@ -1143,7 +1146,8 @@ def go_gradio(**kwargs):
| 1143 |                     )
| 1144 |                     min_max_new_tokens = gr.Slider(
| 1145 |                         minimum=1, maximum=max_max_new_tokens, step=1,
| 1146 | -                       value=min(max_max_new_tokens, kwargs['min_max_new_tokens']),
| 1147 |                     )
| 1148 |                     early_stopping = gr.Checkbox(label="EarlyStopping", info="Stop early in beam search",
| 1149 |                                                  value=kwargs['early_stopping'], visible=max_beams > 1)

@@ -2881,7 +2885,6 @@ def go_gradio(**kwargs):
| 2881 |         history = args_list[-1]
| 2882 |         if not history:
| 2883 |             history = []
| 2884 | -       # NOTE: For these, could check if None, then automatically use CLI values, but too complex behavior
| 2885 |         prompt_type1 = args_list[eval_func_param_names.index('prompt_type')]
| 2886 |         prompt_dict1 = args_list[eval_func_param_names.index('prompt_dict')]
| 2887 |         langchain_mode1 = args_list[eval_func_param_names.index('langchain_mode')]
| 737  |                                                   visible=True,
| 738  |                                                   elem_id="langchain_agents",
| 739  |                                                   filterable=False)
| 740  | +           visible_doc_track = upload_visible and kwargs['visible_doc_track'] and not kwargs[
| 741  | +               'large_file_count_mode']
| 742  |             row_doc_track = gr.Row(visible=visible_doc_track)
| 743  |             with row_doc_track:
| 744  |                 if kwargs['langchain_mode'] in langchain_modes_non_db:

| 785  |                 text_output_nochat_api = gr.Textbox(lines=5, label='API nochat output', visible=False,
| 786  |                                                     show_copy_button=True)
| 787  |
| 788  | +           visible_upload = (allow_upload_to_user_data or
| 789  | +                             allow_upload_to_my_data) and \
| 790  | +                            kwargs['langchain_mode'] != 'Disabled'
| 791  |             # CHAT
| 792  |             col_chat = gr.Column(visible=kwargs['chat'])
| 793  |             with col_chat:

| 810  |                                          size="sm",
| 811  |                                          min_width=24,
| 812  |                                          file_types=['.' + x for x in file_types],
| 813  | +                                        file_count="multiple",
| 814  | +                                        visible=visible_upload)
| 815  |
| 816  |             submit_buttons = gr.Row(equal_height=False, visible=kwargs['visible_submit_buttons'])
| 817  |             with submit_buttons:

| 891  |                                  visible=sources_visible and allow_upload_to_user_data)
| 892  |             with gr.Column(scale=4):
| 893  |                 pass
| 894  | +       visible_add_remove_collection = visible_upload
| 895  |         with gr.Row():
| 896  |             with gr.Column(scale=1):
| 897  |                 add_placeholder = "e.g. UserData2, shared, user_path2" \
| 898  |                     if not is_public else "e.g. MyData2, personal (optional)"
| 899  |                 remove_placeholder = "e.g. UserData2" if not is_public else "e.g. MyData2"

| 1146 |                     )
| 1147 |                     min_max_new_tokens = gr.Slider(
| 1148 |                         minimum=1, maximum=max_max_new_tokens, step=1,
| 1149 | +                       value=min(max_max_new_tokens, kwargs['min_max_new_tokens']),
| 1150 | +                       label="Min. of Max output length",
| 1151 |                     )
| 1152 |                     early_stopping = gr.Checkbox(label="EarlyStopping", info="Stop early in beam search",
| 1153 |                                                  value=kwargs['early_stopping'], visible=max_beams > 1)

| 2885 |         history = args_list[-1]
| 2886 |         if not history:
| 2887 |             history = []
| 2888 |         prompt_type1 = args_list[eval_func_param_names.index('prompt_type')]
| 2889 |         prompt_dict1 = args_list[eval_func_param_names.index('prompt_dict')]
| 2890 |         langchain_mode1 = args_list[eval_func_param_names.index('langchain_mode')]
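
The net effect of the gradio_runner.py changes: a single visible_upload flag, computed once from the upload permissions and langchain_mode, now gates the attach button (file_count="multiple", visible=visible_upload) and replaces the inline expression behind visible_add_remove_collection. A minimal sketch of that pattern with illustrative component names (not the actual h2oGPT layout):

import gradio as gr

# stand-ins for the CLI/kwargs values the real app reads
allow_upload_to_user_data = True
allow_upload_to_my_data = True
langchain_mode = 'UserData'

visible_upload = (allow_upload_to_user_data or
                  allow_upload_to_my_data) and \
                 langchain_mode != 'Disabled'

with gr.Blocks() as demo:
    # one flag gates every upload-related control, so disabling langchain
    # (or both upload targets) hides them all consistently
    attach_btn = gr.UploadButton("Attach", file_count="multiple", visible=visible_upload)
    new_collection = gr.Textbox(label="New collection", visible=visible_upload)
    remove_collection = gr.Textbox(label="Remove collection", visible=visible_upload)

# demo.launch()  # layout-only sketch; not launched here
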