arlineka
/

manbasya_2x7b_MOE

Text Generation

text-generation-inference

Inference Endpoints

4-bit precision

Model card Files Files and versions Community

arlineka committed on Feb 11, 2024

Commit

1621f99

·

verified ·

1 Parent(s): 94315ce

Update README.md

Files changed (1) hide show

README.md +31 -18

README.md CHANGED Viewed

@@ -2,24 +2,37 @@
 license: apache-2.0
 ---
 ```
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-import math
-model_path = "arlineka/manbasya_2x7b_MOE"
-tokenizer = AutoTokenizer.from_pretrained(model_path, use_default_system_prompt=False)
-model = AutoModelForCausalLM.from_pretrained(
-    model_path, torch_dtype=torch.float32, device_map='auto',local_files_only=False, load_in_4bit=True
-)
-print(model)
-prompt = input("please input prompt:")
-while len(prompt) > 0:
-  input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
-  generation_output = model.generate(
-    input_ids=input_ids, max_new_tokens=1024,repetition_penalty=1.2
-  )
-  print(tokenizer.decode(generation_output[0]))
-  prompt = input("please input prompt:")
 ```

 license: apache-2.0
 ---
+AWQ Quantized
+```
+!pip install  git+https://github.com/huggingface/transformers.git -q
+!pip install huggingface_hub
+!pip install autoawq -q
 ```
+```
+from awq import AutoAWQForCausalLM
+from transformers import AutoTokenizer
 import torch
+# Load the AWQ-quantized model and its tokenizer
+model_name_or_path = "arlineka/manbasya_2x7b_MOE"
+model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True, trust_remote_code=False, safetensors=True)
+tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)
+# Set device to CUDA if available
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Move model to the device
+model.to(device)
+# Prepare your input text and move input tensors to the same device
+input_text = "Hello. Input Here"
+input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
+# Now generate text with model and input tensors on the same device
+output = model.generate(input_ids, max_new_tokens=2048)  # Example usage, adjust as necessary
+generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
+print(generated_text)
 ```