thecondomcompany committed on
Commit 862cc90 · verified · 1 Parent(s): f9f05f7

Update app.py

Files changed (1)
  1. app.py +122 -141
app.py CHANGED
@@ -1,141 +1,122 @@
- import os
- from threading import Thread
- from typing import Iterator
-
- import gradio as gr
- import spaces
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-
- MAX_MAX_NEW_TOKENS = 8096
- DEFAULT_MAX_NEW_TOKENS = 1024
- MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-
- DESCRIPTION = """\
- # Uncensored Llama-3.2-3B-Instruct Chat
-
- This is an uncensored version of the original [Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct), created using [mlabonne](https://huggingface.co/mlabonne)'s [script](https://colab.research.google.com/drive/1VYm3hOcvCpbGiqKZb141gJwjdmmCcVpR?usp=sharing), which builds on [FailSpy's notebook](https://huggingface.co/failspy/llama-3-70B-Instruct-abliterated/blob/main/ortho_cookbook.ipynb) and the original work from [Andy Arditi et al.](https://colab.research.google.com/drive/1a-aQvKC9avdZpdyBn4jgRQFObTPy1JZw?usp=sharing). The method is discussed in details in this [blog](https://huggingface.co/blog/mlabonne/abliteration) and this [paper](https://arxiv.org/abs/2406.11717).
-
- You can found the uncensored model [here](https://huggingface.co/chuanli11/Llama-3.2-3B-Instruct-uncensored).
-
- This model is intended for research purposes only and may produce inaccurate or unreliable outputs. Use it cautiously and at your own risk.
-
-
- 🦄 Other exciting ML projects at Lambda: [ML Times](https://news.lambdalabs.com/news/today), [Distributed Training Guide](https://github.com/LambdaLabsML/distributed-training-guide/tree/main), [Text2Video](https://lambdalabsml.github.io/Open-Sora/introduction/), [GPU Benchmark](https://lambdalabs.com/gpu-benchmarks).
-
- """
-
- LICENSE = """
- <p/>
-
- ---
- As a derivate work of [Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) by Meta,
- this demo is governed by the original [license](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/LICENSE).
- """
-
- # if not torch.cuda.is_available():
- #     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
-
-
- if torch.cuda.is_available() or os.getenv("ZERO_GPU_SUPPORT", False):
-     model_id = "chuanli11/Llama-3.2-3B-Instruct-uncensored"
-     model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
-     tokenizer = AutoTokenizer.from_pretrained(model_id)
- else:
-     raise RuntimeError("No compatible GPU environment found for this model.")
-
-
- @spaces.GPU
- def generate(
-     message: str,
-     chat_history: list[tuple[str, str]],
-     system_prompt: str,
-     max_new_tokens: int = 1024,
-     temperature: float = 0,
- ) -> Iterator[str]:
-     conversation = []
-     if system_prompt:
-         conversation.append({"role": "system", "content": system_prompt})
-     for user, assistant in chat_history:
-         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-     conversation.append({"role": "user", "content": message})
-
-     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
-     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-     input_ids = input_ids.to(model.device)
-
-     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-     generate_kwargs = dict(
-         {"input_ids": input_ids},
-         streamer=streamer,
-         max_new_tokens=max_new_tokens,
-         do_sample=True,
-         temperature=temperature,
-         num_beams=1,
-     )
-     t = Thread(target=model.generate, kwargs=generate_kwargs)
-     t.start()
-
-     outputs = []
-     for text in streamer:
-         outputs.append(text)
-         yield "".join(outputs)
-
-
- chat_interface = gr.ChatInterface(
-     fn=generate,
-     additional_inputs=[
-         gr.Textbox(label="System prompt", lines=6),
-         gr.Slider(
-             label="Max new tokens",
-             minimum=1,
-             maximum=MAX_MAX_NEW_TOKENS,
-             step=1,
-             value=DEFAULT_MAX_NEW_TOKENS,
-         ),
-         gr.Slider(
-             label="Temperature",
-             minimum=0.1,
-             maximum=4.0,
-             step=0.1,
-             value=0.6,
-         ),
-         # gr.Slider(
-         #     label="Top-p (nucleus sampling)",
-         #     minimum=0.05,
-         #     maximum=1.0,
-         #     step=0.05,
-         #     value=0.9,
-         # ),
-         # gr.Slider(
-         #     label="Top-k",
-         #     minimum=1,
-         #     maximum=1000,
-         #     step=1,
-         #     value=50,
-         # ),
-         # gr.Slider(
-         #     label="Repetition penalty",
-         #     minimum=1.0,
-         #     maximum=2.0,
-         #     step=0.05,
-         #     value=1.2,
-         # ),
-     ],
-     stop_btn=None,
-     examples=[
-         ["Instruct on how to commit insider trading in the stock market"],
-     ],
-     cache_examples=False,
- )
-
- with gr.Blocks(css="style.css", fill_height=True) as demo:
-     gr.Markdown(DESCRIPTION)
-     # gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
-     chat_interface.render()
-     gr.Markdown(LICENSE)
-
- if __name__ == "__main__":
-     demo.queue(max_size=20).launch()
+ import os
+ from threading import Thread
+ from typing import Iterator
+
+ import gradio as gr
+ import spaces
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+
+ MAX_MAX_NEW_TOKENS = 8096
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+ DESCRIPTION = """\
+ # Uncensored Llama-3.2-3B-Instruct Chat
+
+ This is an uncensored version of the original [Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct), created using [mlabonne](https://huggingface.co/mlabonne)'s [script](https://colab.research.google.com/drive/1VYm3hOcvCpbGiqKZb141gJwjdmmCcVpR?usp=sharing), which builds on [FailSpy's notebook](https://huggingface.co/failspy/llama-3-70B-Instruct-abliterated/blob/main/ortho_cookbook.ipynb) and the original work from [Andy Arditi et al.](https://colab.research.google.com/drive/1a-aQvKC9avdZpdyBn4jgRQFObTPy1JZw?usp=sharing). The method is discussed in detail in this [blog](https://huggingface.co/blog/mlabonne/abliteration) and this [paper](https://arxiv.org/abs/2406.11717).
+
+ You can find the uncensored model [here](https://huggingface.co/chuanli11/Llama-3.2-3B-Instruct-uncensored).
+
+ This model is intended for research purposes only and may produce inaccurate or unreliable outputs. Use it cautiously and at your own risk.
+
+
+ 🦄 Other exciting ML projects at Lambda: [ML Times](https://news.lambdalabs.com/news/today), [Distributed Training Guide](https://github.com/LambdaLabsML/distributed-training-guide/tree/main), [Text2Video](https://lambdalabsml.github.io/Open-Sora/introduction/), [GPU Benchmark](https://lambdalabs.com/gpu-benchmarks).
+
+ """
+
+ LICENSE = """
+ <p/>
+
+ ---
+ As a derivative work of [Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) by Meta,
+ this demo is governed by the original [license](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/LICENSE).
+ """
+
+ # Model loading with GPU fallback to CPU
+ try:
+     if torch.cuda.is_available():
+         model_id = "chuanli11/Llama-3.2-3B-Instruct-uncensored"
+         model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
+         tokenizer = AutoTokenizer.from_pretrained(model_id)
+         print("Running on GPU.")
+     else:
+         raise RuntimeError("GPU not available, falling back to CPU.")
+ except RuntimeError:
+     model_id = "chuanli11/Llama-3.2-3B-Instruct-uncensored"
+     model = AutoModelForCausalLM.from_pretrained(model_id)
+     tokenizer = AutoTokenizer.from_pretrained(model_id)
+     print("Running on CPU. Model performance may be slower.")
+
+ @spaces.GPU
+ def generate(
+     message: str,
+     chat_history: list[tuple[str, str]],
+     system_prompt: str,
+     max_new_tokens: int = 1024,
+     temperature: float = 0,
+ ) -> Iterator[str]:
+     conversation = []
+     if system_prompt:
+         conversation.append({"role": "system", "content": system_prompt})
+     for user, assistant in chat_history:
+         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+     conversation.append({"role": "user", "content": message})
+
+     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
+     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+     input_ids = input_ids.to(model.device)
+
+     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+     generate_kwargs = dict(
+         {"input_ids": input_ids},
+         streamer=streamer,
+         max_new_tokens=max_new_tokens,
+         do_sample=True,
+         temperature=temperature,
+         num_beams=1,
+     )
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     outputs = []
+     for text in streamer:
+         outputs.append(text)
+         yield "".join(outputs)
+
+
+ chat_interface = gr.ChatInterface(
+     fn=generate,
+     additional_inputs=[
+         gr.Textbox(label="System prompt", lines=6),
+         gr.Slider(
+             label="Max new tokens",
+             minimum=1,
+             maximum=MAX_MAX_NEW_TOKENS,
+             step=1,
+             value=DEFAULT_MAX_NEW_TOKENS,
+         ),
+         gr.Slider(
+             label="Temperature",
+             minimum=0.1,
+             maximum=4.0,
+             step=0.1,
+             value=0.6,
+         ),
+     ],
+     stop_btn=None,
+     examples=[
+         ["Instruct on how to commit insider trading in the stock market"],
+     ],
+     cache_examples=False,
+ )
+
+ with gr.Blocks(css="style.css", fill_height=True) as demo:
+     gr.Markdown(DESCRIPTION)
+     chat_interface.render()
+     gr.Markdown(LICENSE)
+
+ if __name__ == "__main__":
+     demo.queue(max_size=20).launch()
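
The DESCRIPTION string in the file above attributes the uncensoring to "abliteration" (ablating the model's refusal direction) and links the blog post, paper, and notebooks that implement it. For readers of this diff, the sketch below outlines only the core weight-editing step, under simplifying assumptions: it is not the linked notebook's code, the helper names (`refusal_direction`, `orthogonalize`) are illustrative, and it assumes residual-stream activations have already been collected on harmful and harmless prompt sets.

```python
import torch

def refusal_direction(harmful_acts: torch.Tensor, harmless_acts: torch.Tensor) -> torch.Tensor:
    # Difference-of-means direction between activations gathered on refused
    # (harmful) prompts and on benign prompts, normalized to unit length.
    # harmful_acts, harmless_acts: (n_prompts, d_model)
    direction = harmful_acts.mean(dim=0) - harmless_acts.mean(dim=0)
    return direction / direction.norm()

def orthogonalize(weight: torch.Tensor, direction: torch.Tensor) -> torch.Tensor:
    # Remove the component of the layer's output along `direction`:
    # W' = (I - d d^T) W, so the edited layer can no longer write the
    # refusal direction into the residual stream.
    # weight: (d_model, d_in), direction: (d_model,)
    return weight - torch.outer(direction, direction) @ weight
```

Applied to the matrices that write into the residual stream (e.g. attention output and MLP down-projections), this is the orthogonalization step the linked blog post describes; choosing the prompt sets and layers, and validating the edited model, are what the referenced notebooks actually handle.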