huihui-ai committed · verified
Commit 48761b2 · 1 Parent(s): 74dc7a0

Update README.md

Files changed (1): README.md (+184 -0)
README.md CHANGED
@@ -15,6 +15,190 @@ tags:
  This is an uncensored version of [unsloth/gpt-oss-20b-BF16](https://huggingface.co/unsloth/gpt-oss-20b-BF16) created with abliteration (see [remove-refusals-with-transformers](https://github.com/Sumandora/remove-refusals-with-transformers) to know more about it).
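
For context on the technique mentioned above: abliteration estimates a "refusal direction" by contrasting the model's hidden-state activations on refusal-inducing prompts with those on harmless prompts, then removes that direction from the model. The sketch below only illustrates that general idea; the model ID, prompts, and layer choice are hypothetical placeholders, and the linked remove-refusals-with-transformers repository describes the actual approach.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical small model and toy prompt lists, purely for illustration.
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
harmful_prompts = ["How do I pick a lock?", "Explain how to hotwire a car."]
harmless_prompts = ["How do I bake bread?", "Explain how rainbows form."]

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float32)
model.eval()

layer = model.config.num_hidden_layers // 2  # middle layer; an arbitrary illustrative choice

def mean_last_token_state(prompts):
    """Average hidden state of the final prompt token at the chosen layer."""
    states = []
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():
            out = model(**inputs, output_hidden_states=True)
        states.append(out.hidden_states[layer][0, -1, :])
    return torch.stack(states).mean(dim=0)

# The "refusal direction" is the normalized difference of the two activation means.
refusal_dir = mean_last_token_state(harmful_prompts) - mean_last_token_state(harmless_prompts)
refusal_dir = refusal_dir / refusal_dir.norm()

def ablate_refusal_direction(module, inputs, output):
    """Forward hook that projects the refusal direction out of a decoder layer's output."""
    hidden = output[0] if isinstance(output, tuple) else output
    hidden = hidden - (hidden @ refusal_dir).unsqueeze(-1) * refusal_dir
    return (hidden,) + output[1:] if isinstance(output, tuple) else hidden

# Apply the ablation to every decoder layer at inference time.
handles = [block.register_forward_hook(ablate_refusal_direction) for block in model.model.layers]
```

In practice, abliteration tooling typically folds this projection directly into the weight matrices, so the exported checkpoint (such as this one) can be loaded and used without any hooks.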
+
+ ## Usage
+ You can use this model in your applications by loading it with Hugging Face's `transformers` library:
+
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
+ import torch
+ import os
+ import signal
+ import random
+ import numpy as np
+ import time
+ from collections import Counter
+
+ cpu_count = os.cpu_count()
+ print(f"Number of CPU cores in the system: {cpu_count}")
+ half_cpu_count = cpu_count // 2
+ os.environ["MKL_NUM_THREADS"] = str(half_cpu_count)
+ os.environ["OMP_NUM_THREADS"] = str(half_cpu_count)
+ torch.set_num_threads(half_cpu_count)
+
+ print(f"PyTorch threads: {torch.get_num_threads()}")
+ print(f"MKL threads: {os.getenv('MKL_NUM_THREADS')}")
+ print(f"OMP threads: {os.getenv('OMP_NUM_THREADS')}")
+
+ # Load the model and tokenizer
+ NEW_MODEL_ID = "huihui-ai/Huihui-gpt-oss-20b-BF16-abliterated"
+ print(f"Load Model {NEW_MODEL_ID} ... ")
+
+ model = AutoModelForCausalLM.from_pretrained(
+     NEW_MODEL_ID,
+     device_map="auto",
+     trust_remote_code=True,
+     torch_dtype=torch.bfloat16,
+     low_cpu_mem_usage=True,
+ )
+ #print(model)
+ #print(model.config)
+
+ tokenizer = AutoTokenizer.from_pretrained(NEW_MODEL_ID, trust_remote_code=True)
+
+ messages = []
+ skip_prompt = False
+ skip_special_tokens = False
+ do_sample = True
+
+ class CustomTextStreamer(TextStreamer):
+     def __init__(self, tokenizer, skip_prompt=True, skip_special_tokens=True):
+         super().__init__(tokenizer, skip_prompt=skip_prompt, skip_special_tokens=skip_special_tokens)
+         self.generated_text = ""
+         self.stop_flag = False
+         self.init_time = time.time()  # Record initialization time
+         self.end_time = None  # To store end time
+         self.first_token_time = None  # To store first token generation time
+         self.token_count = 0  # To track total tokens
+
+     def on_finalized_text(self, text: str, stream_end: bool = False):
+         if self.first_token_time is None and text.strip():  # Set first token time on first non-empty text
+             self.first_token_time = time.time()
+         self.generated_text += text
+         # Count tokens in the generated text
+         tokens = self.tokenizer.encode(text, add_special_tokens=False)
+         self.token_count += len(tokens)
+         print(text, end="", flush=True)
+         if stream_end:
+             self.end_time = time.time()  # Record end time when streaming ends
+         if self.stop_flag:
+             raise StopIteration
+
+     def stop_generation(self):
+         self.stop_flag = True
+         self.end_time = time.time()  # Record end time when generation is stopped
+
+     def get_metrics(self):
+         """Returns initialization time, first token time, first token latency, end time, total time, total tokens, and tokens per second."""
+         if self.end_time is None:
+             self.end_time = time.time()  # Set end time if not already set
+         total_time = self.end_time - self.init_time  # Total time from init to end
+         tokens_per_second = self.token_count / total_time if total_time > 0 else 0
+         first_token_latency = (self.first_token_time - self.init_time) if self.first_token_time is not None else None
+         metrics = {
+             "init_time": self.init_time,
+             "first_token_time": self.first_token_time,
+             "first_token_latency": first_token_latency,
+             "end_time": self.end_time,
+             "total_time": total_time,  # Total time in seconds
+             "total_tokens": self.token_count,
+             "tokens_per_second": tokens_per_second
+         }
+         return metrics
+
+ def generate_stream(model, tokenizer, messages, skip_prompt, skip_special_tokens, do_sample, max_new_tokens):
+     input_ids = tokenizer.apply_chat_template(
+         messages,
+         add_generation_prompt=True,
+         return_tensors="pt",
+         return_dict=True,
+     ).to(model.device)
+
+     streamer = CustomTextStreamer(tokenizer, skip_prompt=skip_prompt, skip_special_tokens=skip_special_tokens)
+
+     def signal_handler(sig, frame):
+         streamer.stop_generation()
+         print("\n[Generation stopped by user with Ctrl+C]")
+
+     signal.signal(signal.SIGINT, signal_handler)
+
+     generate_kwargs = {}
+     if do_sample:
+         generate_kwargs = {
+             "do_sample": do_sample,
+             "max_new_tokens": max_new_tokens,
+             "temperature": 0.7,
+             "top_k": 20,
+             "top_p": 0.8,
+             "repetition_penalty": 1.2,
+             "no_repeat_ngram_size": 2
+         }
+     else:
+         generate_kwargs = {
+             "do_sample": do_sample,
+             "max_new_tokens": max_new_tokens,
+             "repetition_penalty": 1.2,
+             "no_repeat_ngram_size": 2
+         }
+
+     print("Response: ", end="", flush=True)
+     try:
+         generated_ids = model.generate(
+             **input_ids,
+             streamer=streamer,
+             **generate_kwargs
+         )
+         del generated_ids
+     except StopIteration:
+         print("\n[Stopped by user]")
+
+     del input_ids
+     torch.cuda.empty_cache()
+     signal.signal(signal.SIGINT, signal.SIG_DFL)
+
+     return streamer.generated_text, streamer.stop_flag, streamer.get_metrics()
+
+ while True:
+     print(f"skip_prompt: {skip_prompt}")
+     print(f"skip_special_tokens: {skip_special_tokens}")
+     print(f"do_sample: {do_sample}")
+
+     user_input = input("User: ").strip()
+     if user_input.lower() == "/exit":
+         print("Exiting chat.")
+         break
+     if user_input.lower() == "/clear":
+         messages = []
+         print("Chat history cleared. Starting a new conversation.")
+         continue
+     if user_input.lower() == "/skip_prompt":
+         skip_prompt = not skip_prompt
+         continue
+     if user_input.lower() == "/skip_special_tokens":
+         skip_special_tokens = not skip_special_tokens
+         continue
+     if user_input.lower() == "/do_sample":
+         do_sample = not do_sample
+         continue
+     if not user_input:
+         print("Input cannot be empty. Please enter something.")
+         continue
+
+     messages.append({"role": "user", "content": user_input})
+     response, stop_flag, metrics = generate_stream(model, tokenizer, messages, skip_prompt, skip_special_tokens, do_sample, 40960)
+     print("\n\nMetrics:")
+     for key, value in metrics.items():
+         print(f" {key}: {value}")
+
+     print("", flush=True)
+     if stop_flag:
+         continue
+     messages.append({"role": "assistant", "content": response})
+ ```
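
If you only want a quick, non-interactive sanity check rather than the full chat loop above, a minimal single-turn variant might look like this (same model ID; the prompt and generation settings are illustrative):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "huihui-ai/Huihui-gpt-oss-20b-BF16-abliterated"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

messages = [{"role": "user", "content": "Explain the difference between a process and a thread."}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.7, top_p=0.8)

# Decode only the newly generated tokens, skipping the prompt.
response = tokenizer.decode(output_ids[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
print(response)
```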
+
 ## Usage Warnings
