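# Header text that appears to have been captured from the hosting web page
# along with this file (avatar caption, commit message, short commit hash):
#   ParitKansal's picture
#   Add all files
#   f96e5ac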
import os
import re
import json
from groq import Groq
def process_and_save_json(input_file_path, output_file_path, api_key, chunk_size=2048, overlap_size=256, model="llama3-8b-8192", about='Events', details=('name', 'details')):
    """
    Processes the input file in overlapping chunks, interacts with the model, and saves the JSON output to a file.

    Each chunk is sent to the model with a system prompt naming the requested
    columns. Every JSON object found in a reply is parsed, its scalar values are
    converted to strings, and the objects collected from all chunks are written
    to the output file as one JSON list. Replies fragments that look like an
    object but do not parse are printed and skipped.

    Parameters:
    input_file_path (str): Path to the input file containing data.
    output_file_path (str): Path to the output JSON file.
    api_key (str): Groq API key for authentication.
    chunk_size (int): Size of each chunk of data, in characters. Must be positive.
    overlap_size (int): Number of characters to overlap between chunks.
        Must satisfy 0 <= overlap_size < chunk_size.
    model (str): Model identifier to use for processing.
    about (str): Description of the data for the model.
    details (list or str): List of column names expected in the output JSON or a specific condition.
        A string is inserted into the prompt as-is; any other iterable of
        strings is joined with ", ".

    Raises:
    ValueError: If chunk_size or overlap_size is outside the ranges above.
    """
    # Fail fast: with these values the chunk window would never advance (or
    # would skip text), so reject them before any file or network access.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap_size < chunk_size:
        raise ValueError("overlap_size must be non-negative and smaller than chunk_size")

    # Initialize the Groq client with the provided API key
    client = Groq(api_key=api_key)

    separator = "+++++++++++++++++++++++++++++++++++++++++++++++++"
    combined_output = []

    # Read and process the file in chunks
    for chunk in _read_file_in_chunks(input_file_path, chunk_size, overlap_size):
        output = _process_chunk(client, chunk, model, about, details)
        parsed_objects, rejected_texts = _extract_json_objects(output)

        # Normalise every scalar value to a string before collecting
        combined_output.extend(_ensure_strings_in_json(item) for item in parsed_objects)

        for brace_text in rejected_texts:
            print(separator)
            print("Invalid JSON format in extracted text:")
            print(brace_text)
            print(separator)

    # Output the combined result to a JSON file
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(combined_output, f, indent=4)

    print(f"Processing complete. Output saved to '{output_file_path}'.")


def _read_file_in_chunks(file_path, chunk_size, overlap_size):
    """Yield overlapping chunks of the file's text.

    Consecutive chunks share `overlap_size` characters. The caller must ensure
    0 <= overlap_size < chunk_size so that the window always moves forward.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        buffer = f.read()

    step = chunk_size - overlap_size
    start = 0
    while start < len(buffer):
        end = start + chunk_size
        yield buffer[start:end]
        if end >= len(buffer):
            # This chunk already reached the end of the text; another one would
            # contain only characters that were just sent.
            break
        start += step


def _extract_json_objects(text):
    """Find JSON objects embedded in `text`.

    Returns a pair `(objects, rejected)`: `objects` holds each successfully
    decoded top-level object (nested objects stay inside their parent), and
    `rejected` holds the text spans that start with "{" but are not valid JSON.
    """
    decoder = json.JSONDecoder()
    objects = []
    rejected = []

    position = text.find('{')
    while position != -1:
        try:
            # raw_decode parses one complete JSON value starting at `position`
            # and reports where it ended, so nested braces and braces inside
            # string values are handled by the JSON grammar itself.
            parsed, resume_at = decoder.raw_decode(text, position)
        except json.JSONDecodeError:
            closing = text.find('}', position)
            if closing == -1:
                # No closing brace remains anywhere, so nothing further can match.
                break
            # Report the span up to the first closing brace and continue after it.
            rejected.append(text[position:closing + 1])
            resume_at = closing + 1
        else:
            objects.append(parsed)
        position = text.find('{', resume_at)

    return objects, rejected


def _ensure_strings_in_json(data):
    """Return a copy of `data` in which every non-dict, non-list value is a string."""
    if isinstance(data, dict):
        return {key: _ensure_strings_in_json(value) for key, value in data.items()}
    if isinstance(data, list):
        return [_ensure_strings_in_json(item) for item in data]
    return str(data)


def _process_chunk(client, chunk, model, about, details):
    """Send one chunk to the model and return the text of its reply."""
    # A plain string is used verbatim; joining it would interleave ", "
    # between its individual characters.
    columns = details if isinstance(details, str) else ', '.join(details)

    system_message = (
        f"You are a helpful assistant for cleaning and organizing the data. \n"
        f"This data is about {about}. \n"
        f"Output should be well organized in the form of JSON with the following columns: {columns}.\n"
        f"Do not add extra details apart from JSON.\n"
        f"If there is no such data, return an empty list.\n"
    )

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": system_message
            },
            {
                "role": "user",
                "content": chunk
            }
        ],
        temperature=1,
        max_tokens=8192,
        top_p=1,
        stream=False,
        stop=None,
    )

    # Accessing the message content using dot notation.
    # NOTE(review): presumably the content can be None for an empty reply;
    # fall back to "" so extraction simply finds nothing - confirm against the
    # client library.
    return completion.choices[0].message.content or ''