Update README.md
README.md
CHANGED
@@ -106,133 +106,59 @@ This model is solely designed for research settings, and its testing has only been carried
out in such environments. It should not be used in downstream applications, as additional
analysis is needed to assess potential harm or bias in the proposed application.

**Before:**

## Getting started with Orca

**Inference with Hugging Face library**

```python
import torch
import transformers

if torch.cuda.is_available():
    torch.set_default_device("cuda")
else:
    torch.set_default_device("cpu")

model = transformers.AutoModelForCausalLM.from_pretrained("microsoft/Orca-2-13b", device_map='auto')

# https://github.com/huggingface/transformers/issues/27132
# please use the slow tokenizer since the fast and slow tokenizers produce different tokens
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "microsoft/Orca-2-13b",
    use_fast=False,
)

system_message = "You are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."
user_message = "How can you determine if a restaurant is popular among locals or mainly attracts tourists, and why might this information be useful?"

prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"

inputs = tokenizer(prompt, return_tensors='pt')
output_ids = model.generate(inputs["input_ids"])
answer = tokenizer.batch_decode(output_ids)[0]

print(answer)

# Continue the conversation with a second turn from the user
second_turn_user_message = "Give me a list of the key points of your first answer."

# add_special_tokens=False so that no extra bos token is inserted between the messages
second_turn_message_in_markup = f"\n<|im_start|>user\n{second_turn_user_message}<|im_end|>\n<|im_start|>assistant"
second_turn_tokens = tokenizer(second_turn_message_in_markup, return_tensors='pt', add_special_tokens=False)
second_turn_input = torch.cat([output_ids, second_turn_tokens['input_ids']], dim=1)

output_ids_2 = model.generate(second_turn_input)
second_turn_answer = tokenizer.batch_decode(output_ids_2)[0]

print(second_turn_answer)
```

**Safe inference with Azure AI Content Safety**

Azure AI Content Safety is a service
that uses AI to keep your content safe. By integrating Orca 2 with Azure AI Content Safety,
we can moderate the model output by scanning it for sexual content, violence, hate, and
self-harm with multiple severity levels and multi-lingual detection.

```python
import os
import math
import transformers
import torch

from azure.ai.contentsafety import ContentSafetyClient
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError
from azure.ai.contentsafety.models import AnalyzeTextOptions

CONTENT_SAFETY_KEY = os.environ["CONTENT_SAFETY_KEY"]
CONTENT_SAFETY_ENDPOINT = os.environ["CONTENT_SAFETY_ENDPOINT"]

# We use Azure AI Content Safety to filter out any content that reaches the "Medium" threshold
# For more information: https://learn.microsoft.com/en-us/azure/ai-services/content-safety/
def should_filter_out(input_text, threshold=4):
    # Create a Content Safety client
    client = ContentSafetyClient(CONTENT_SAFETY_ENDPOINT, AzureKeyCredential(CONTENT_SAFETY_KEY))

    # Construct a request
    request = AnalyzeTextOptions(text=input_text)

    # Analyze text
    try:
        response = client.analyze_text(request)
    except HttpResponseError as e:
        print("Analyze text failed.")
        if e.error:
            print(f"Error code: {e.error.code}")
            print(f"Error message: {e.error.message}")
            raise
        print(e)
        raise

    categories = ["hate_result", "self_harm_result", "sexual_result", "violence_result"]
    max_score = -math.inf
    for category in categories:
        max_score = max(max_score, getattr(response, category).severity)

    return max_score >= threshold

model_path = 'microsoft/Orca-2-13b'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = transformers.AutoModelForCausalLM.from_pretrained(model_path)
model.to(device)

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_path,
    model_max_length=4096,
    padding_side="right",
    use_fast=False,
    add_special_tokens=False,
)

prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"

inputs = tokenizer(prompt, return_tensors='pt')
inputs = inputs.to(device)

output_ids = model.generate(inputs["input_ids"], max_length=4096, do_sample=False, temperature=0.0, use_cache=True)
sequence_length = inputs["input_ids"].shape[1]
new_output_ids = output_ids[:, sequence_length:]
answers = tokenizer.batch_decode(new_output_ids, skip_special_tokens=True)
final_output = answers[0] if not should_filter_out(answers[0]) else "[Content Filtered]"

print(final_output)
```

**After:**

## Getting started with Orca-2-13b-awq

**Inference with Hugging Face library**

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model from Hugging Face Hub
quant_path = "oieieio/Orca-2-13b-awq"
tokenizer = AutoTokenizer.from_pretrained(quant_path)
model = AutoModelForCausalLM.from_pretrained(quant_path)

# Move the model to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# First turn of the conversation
system_message = "You are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."
user_message = "How can you determine if a restaurant is popular among locals or mainly attracts tourists, and why might this information be useful?"

prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"

# Encode the first prompt
inputs = tokenizer(prompt, return_tensors='pt').to(device)
output_ids = model.generate(inputs["input_ids"], max_length=512)

# Decode the first response
answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print the first response
print(answer)

# Second turn of the conversation
second_turn_user_message = "Give me a list of the key points of your first answer."

# Append the second turn message to the already generated ids without adding special tokens
second_turn_message_in_markup = f"\n<|im_start|>user\n{second_turn_user_message}<|im_end|>\n<|im_start|>assistant"
second_turn_tokens = tokenizer(second_turn_message_in_markup, return_tensors='pt', add_special_tokens=False).to(device)
second_turn_input_ids = torch.cat([output_ids, second_turn_tokens['input_ids']], dim=1)

# Generate the second response
output_ids_2 = model.generate(second_turn_input_ids, max_length=1024)

# Decode the second response
second_turn_answer = tokenizer.decode(output_ids_2[0], skip_special_tokens=True)

# Print the second response
print(second_turn_answer)
```
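
Note that decoding `output_ids[0]` returns the full transcript, prompt included. If you only want the newly generated reply, you can slice off the prompt tokens before decoding, the same trick used in the earlier content-safety example. A minimal sketch, reusing `model`, `tokenizer`, `device`, and `prompt` from the example above:

```python
# Minimal sketch: decode only the tokens generated after the prompt.
# Assumes model, tokenizer, device and prompt are defined as in the example above.
inputs = tokenizer(prompt, return_tensors='pt').to(device)
output_ids = model.generate(inputs["input_ids"], max_length=512)

prompt_length = inputs["input_ids"].shape[1]   # number of prompt tokens
reply_ids = output_ids[:, prompt_length:]      # keep only the newly generated tokens
reply = tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]
print(reply)
```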

## Citation
```bibtex