## Expects a CSV passed as a command-line argument with columns 'prompt' and 'response', and outputs a CSV with moderation ratings appended.
## Define your OpenAI key in a key.env file as cgpt_key="<your key>", or hardcode it in the script.
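## Example key.env contents (placeholder value, substitute your own key):
##   cgpt_key="sk-..."
## Example input CSV layout (column names must match exactly; rows are illustrative):
##   prompt,response
##   "Tell me about X","X is ..."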
import openai
import pandas as pd
import os
import time
from datetime import datetime, timezone
from dotenv import load_dotenv
import sys
# Load environment variables (ensure your API key is in key.env)
load_dotenv("key.env")
api_key = os.getenv("cgpt_key")  # Get API key
# Initialize OpenAI client
client = openai.OpenAI(api_key=api_key)
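# Exit early with a usage message if no input CSV path was supplied
if len(sys.argv) < 2:
    sys.exit("Usage: python openai_moderation.py <input.csv>")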
| # Input & output file paths >> Set desired output name here | |
| INPUT_CSV = sys.argv[1] | |
| OUTPUT_CSV = "openai_librai_moderated_output.csv" | |
# Function to run one text through the OpenAI moderation endpoint
def moderate_text(text):
    try:
        start_time = time.time()  # Start timing
        response = client.moderations.create(input=text, model="omni-moderation-latest")
        end_time = time.time()  # End timing
        moderation_result = response.results[0]
        return {
            "Flagged": moderation_result.flagged,
            "Hate": moderation_result.categories.hate,
            "Hate/Threatening": moderation_result.categories.hate_threatening,
            "Harassment": moderation_result.categories.harassment,
            "Harassment/Threatening": moderation_result.categories.harassment_threatening,
            "Self-Harm": moderation_result.categories.self_harm,
            "Self-Harm/Intent": moderation_result.categories.self_harm_intent,
            "Self-Harm/Instructions": moderation_result.categories.self_harm_instructions,
            "Violence": moderation_result.categories.violence,
            "Violence/Graphic": moderation_result.categories.violence_graphic,
            "Sexual": moderation_result.categories.sexual,
            "Sexual/Minors": moderation_result.categories.sexual_minors,
            "Illicit": moderation_result.categories.illicit,
            "Illicit/Violent": moderation_result.categories.illicit_violent,
            "Processing_Time (s)": round(end_time - start_time, 4),
            "Timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
            "Model": response.model,
            "Response_ID": response.id
        }
    except Exception as e:
        print(f"Error processing text: {text} -> {e}")
        return None  # Return None if an error occurs
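# Illustrative example of a call and its return shape (values are placeholders,
# not real API output):
#   moderate_text("some model response")
#   -> {"Flagged": False, "Hate": False, ..., "Processing_Time (s)": 0.4213,
#       "Model": "omni-moderation-latest", "Response_ID": "modr-..."}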
# Load dataset
df = pd.read_csv(INPUT_CSV)
# Create the output CSV with a header row if it does not already exist
if not os.path.exists(OUTPUT_CSV):
    columns = ["prompt", "response", "Flagged", "Hate", "Hate/Threatening",
               "Harassment", "Harassment/Threatening", "Self-Harm", "Self-Harm/Intent",
               "Self-Harm/Instructions", "Violence", "Violence/Graphic", "Sexual",
               "Sexual/Minors", "Illicit", "Illicit/Violent",
               "Processing_Time (s)", "Timestamp", "Model", "Response_ID"]
    pd.DataFrame(columns=columns).to_csv(OUTPUT_CSV, index=False)
# Process each row
for index, row in df.iterrows():
    prompt = row["prompt"]
    response = row["response"]
    # Moderate the response
    moderation_results = moderate_text(response)
    if moderation_results:
        row_data = {
            "prompt": prompt,
            "response": response,
            **moderation_results  # Expands the dictionary into individual columns
        }
        # Append to CSV
        pd.DataFrame([row_data]).to_csv(OUTPUT_CSV, mode="a", header=False, index=False)
        # Print progress
        print(f"Processed row {index+1}/{len(df)} - Flagged: {moderation_results['Flagged']} - Time: {moderation_results['Processing_Time (s)']}s")
print(f"✅ Moderation complete. Results saved to {OUTPUT_CSV}")