	Update app.py
app.py CHANGED
@@ -1,50 +1,78 @@
 import os
 import torch
-from transformers import AutoTokenizer, AutoConfig
-from optimum.intel.openvino import OVModelForCausalLM
-import openvino as ov
 import gradio as gr
+import ipywidgets as widgets
+from pathlib import Path
+from transformers import AutoConfig, AutoTokenizer
+from optimum.intel.openvino import OVModelForCausalLM
 from typing import List, Tuple
 from threading import Event, Thread
-from gradio_helper import make_demo
-from llm_config import SUPPORTED_LLM_MODELS
+from gradio_helper import make_demo  # Your helper function for Gradio demo
+from llm_config import SUPPORTED_LLM_MODELS  # Model configuration
+from notebook_utils import device_widget  # Device selection utility
+import openvino as ov
+import openvino.properties as props
+import openvino.properties.hint as hints
+import openvino.properties.streams as streams
+import requests

-# Define model
-
-
+# Define the model loading function (same as in your notebook)
+def convert_to_int4(model_id, model_configuration, enable_awq=False):
+    # Model conversion logic here (same as in notebook)
+    compression_configs = {
+        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
+        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
+    }
+    model_compression_params = compression_configs.get(model_id, compression_configs["default"])
+
+    # Example conversion logic
+    int4_model_dir = Path(model_id) / "INT4_compressed_weights"
+    if (int4_model_dir / "openvino_model.xml").exists():
+        return int4_model_dir
+    remote_code = model_configuration.get("remote_code", False)
+    export_command_base = f"optimum-cli export openvino --model {model_configuration['model_id']} --task text-generation-with-past --weight-format int4"
+    int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
+    if model_compression_params["sym"]:
+        int4_compression_args += " --sym"
+    if enable_awq:
+        int4_compression_args += " --awq --dataset wikitext2 --num-samples 128"
+    export_command_base += int4_compression_args
+    if remote_code:
+        export_command_base += " --trust-remote-code"
+    export_command = export_command_base + f" {str(int4_model_dir)}"
+
+    # Execute export command (shell command)
+    os.system(export_command)
+    return int4_model_dir

-# Load model configuration
-model_configuration = SUPPORTED_LLM_MODELS[model_language][model_id]
-pt_model_id = model_configuration["model_id"]
-int4_model_dir = os.path.join(model_id, "INT4_compressed_weights")

-#
-
-
-
-
+# Model and tokenizer loading
+def load_model(model_dir, device):
+    # Load model using OpenVINO
+    ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""}
+    core = ov.Core()
+    model_name = model_configuration["model_id"]
+    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

-
-
-
-
-
-
-)
+    ov_model = OVModelForCausalLM.from_pretrained(
+        model_dir,
+        device=device,
+        ov_config=ov_config,
+        config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
+        trust_remote_code=True,
+    )

-
-    """
-    Converts conversation history to tokens based on model configuration.
-    """
-    input_ids = tok.encode(history[-1][0])  # Simple example for tokenizing the last user input.
-    return torch.LongTensor([input_ids])
+    return ov_model, tok

+# Define the bot function that interacts with Gradio UI
 def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
-    """
-    Generates the next part of the conversation.
-    """
     input_ids = convert_history_to_token(history)
+    if input_ids.shape[1] > 2000:
+        history = [history[-1]]  # Limit input size
+        input_ids = convert_history_to_token(history)
+
     streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
+
     generate_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens=256,
@@ -55,20 +83,44 @@ def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id)
         repetition_penalty=repetition_penalty,
         streamer=streamer,
     )
-
-    #
-
-
-
+
+    # Function to generate response in a separate thread
+    def generate_and_signal_complete():
+        ov_model.generate(**generate_kwargs)
+        stream_complete.set()
+
+    t1 = Thread(target=generate_and_signal_complete)
+    t1.start()
+
+    # Process partial text and return updated history
     partial_text = ""
     for new_text in streamer:
-        partial_text
+        partial_text = text_processor(partial_text, new_text)
         history[-1][1] = partial_text
         yield history

-
-
+# Gradio interface setup
+def create_gradio_interface():
+    model_language = SUPPORTED_LLM_MODELS.keys()  # List of model languages
+    model_id = widgets.Dropdown(options=model_language, value=model_language[0], description="Model Language:")
+
+    # Choose model based on the selected language
+    model_configuration = SUPPORTED_LLM_MODELS[model_language[0]][model_id.value]
+
+    # Prepare model (convert to INT4, etc.)
+    int4_model_dir = convert_to_int4(model_id.value, model_configuration)
+
+    # Load model and tokenizer
+    device = device_widget("CPU")
+    ov_model, tok = load_model(int4_model_dir, device)
+
+    # Create the Gradio app
+    demo = make_demo(run_fn=bot, stop_fn=request_cancel, title=f"OpenVINO Chatbot", language=model_language[0])
+
+    return demo
+
+# Run the Gradio app
+if __name__ == "__main__":
+    app = create_gradio_interface()
+    app.launch(debug=True, share=True)  # share=True for public access
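A note on convert_to_int4: it shells out to optimum-cli through os.system. The sketch below shows the command it would assemble for the qwen2.5-0.5b-instruct entry above (sym=True, group_size=128, ratio=1.0). The Hub id Qwen/Qwen2.5-0.5B-Instruct is an assumption; the real value comes from llm_config.SUPPORTED_LLM_MODELS, which this diff does not show.

    # Hypothetical configuration entry; the real one lives in llm_config.SUPPORTED_LLM_MODELS.
    cfg = {"model_id": "Qwen/Qwen2.5-0.5B-Instruct", "remote_code": False}

    int4_dir = convert_to_int4("qwen2.5-0.5b-instruct", cfg)
    print(int4_dir)  # qwen2.5-0.5b-instruct/INT4_compressed_weights

    # The os.system() call would then run (a single line, wrapped here for readability):
    #   optimum-cli export openvino --model Qwen/Qwen2.5-0.5B-Instruct \
    #       --task text-generation-with-past --weight-format int4 \
    #       --group-size 128 --ratio 1.0 --sym qwen2.5-0.5b-instruct/INT4_compressed_weights

subprocess.run(export_command.split(), check=True) would be a safer choice than os.system, since a failed export then raises instead of silently returning a nonexistent model directory.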
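The new bot() still depends on several names that neither side of this diff defines or imports: TextIteratorStreamer, convert_history_to_token, text_processor, stream_complete, and request_cancel, plus module-level tok and ov_model. A minimal sketch of those missing pieces follows; the first helper mirrors the deleted code where it is visible, the other bodies are assumptions.

    # Drop-in additions for app.py; assumes the imports already present above.
    from transformers import TextIteratorStreamer  # used in bot() but never imported

    stream_complete = Event()  # set once generation in the worker thread finishes

    def convert_history_to_token(history: List[Tuple[str, str]]):
        # Mirrors the deleted helper: encode only the last user message.
        input_ids = tok.encode(history[-1][0])
        return torch.LongTensor([input_ids])

    def text_processor(partial_text: str, new_text: str) -> str:
        # Assumption: plain accumulation; some models need extra cleanup here.
        return partial_text + new_text

    def request_cancel():
        # Assumption: cancel the active inference request to stop streaming.
        ov_model.request.cancel()

tok and ov_model must also be visible at module scope (see the next note), otherwise bot() raises NameError when the demo calls it.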
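One more fix is needed before this runs: SUPPORTED_LLM_MODELS.keys() returns a dict view, which does not support indexing, so model_language[0] raises TypeError, and the dropdown is labeled as a language selector while its value is used as a model id. A corrected sketch, assuming SUPPORTED_LLM_MODELS maps language to {model id: configuration} as the lookups above imply:

    def create_gradio_interface():
        model_languages = list(SUPPORTED_LLM_MODELS)  # dict keys as an indexable list
        language = model_languages[0]
        model_ids = list(SUPPORTED_LLM_MODELS[language])
        model_id = widgets.Dropdown(options=model_ids, value=model_ids[0], description="Model:")

        model_configuration = SUPPORTED_LLM_MODELS[language][model_id.value]
        int4_model_dir = convert_to_int4(model_id.value, model_configuration)

        device = device_widget("CPU")
        global ov_model, tok  # bot() and request_cancel() read these at module scope
        # device_widget likely returns a widget; unwrap .value if so (assumption).
        ov_model, tok = load_model(int4_model_dir, device.value if hasattr(device, "value") else device)

        return make_demo(run_fn=bot, stop_fn=request_cancel, title="OpenVINO Chatbot", language=language)

Relatedly, the unused core and model_name lines in load_model can simply be dropped: model_name reads model_configuration from a scope the function does not have, and nothing in the function uses either value.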