# Web-page capture residue ("Spaces: Sleeping Sleeping"); not part of the program.
import json
import os
import re

from groq import Groq
def _read_file_in_chunks(file_path, chunk_size, overlap_size):
    """Yield successive text windows of the file, each overlapping the previous one.

    The whole file is read as UTF-8 text and sliced into windows of
    ``chunk_size`` characters whose start positions advance by
    ``chunk_size - overlap_size``. The caller must guarantee that this step is
    positive, otherwise the loop cannot make progress.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        buffer = f.read()

    step = chunk_size - overlap_size
    start = 0
    while start < len(buffer):
        end = start + chunk_size
        yield buffer[start:end]
        if end >= len(buffer):
            # The window just yielded already reached the end of the text; one
            # more window would only repeat its tail.
            break
        start += step


def _extract_json_objects(text):
    """Return every JSON object that can be decoded from *text*, in order.

    Decoding starts at each ``{`` and uses the JSON decoder itself to find the
    matching end, so nested objects and braces inside string values are
    handled. A ``{`` that does not start a valid object is reported on stdout
    and scanning resumes one character later, which still recovers complete
    objects embedded in a malformed or truncated outer structure.
    """
    decoder = json.JSONDecoder()
    objects = []
    pos = text.find('{')
    while pos != -1:
        try:
            obj, consumed = decoder.raw_decode(text[pos:])
        except json.JSONDecodeError:
            # Show the text up to the first closing brace (or the rest of the
            # text when there is none) so the failure can be inspected.
            closing = text.find('}', pos)
            snippet = text[pos:closing + 1] if closing != -1 else text[pos:]
            print("+++++++++++++++++++++++++++++++++++++++++++++++++")
            print("Invalid JSON format in extracted text:")
            print(snippet)
            print("+++++++++++++++++++++++++++++++++++++++++++++++++")
            pos = text.find('{', pos + 1)
        else:
            objects.append(obj)
            # Continue after the decoded object so its nested objects are not
            # reported a second time as separate records.
            pos = text.find('{', pos + consumed)
    return objects


def _stringify_json_values(data):
    """Return a copy of *data* in which every non-container value is a str."""
    if isinstance(data, dict):
        return {key: _stringify_json_values(value) for key, value in data.items()}
    if isinstance(data, list):
        return [_stringify_json_values(item) for item in data]
    return str(data)


def _build_system_message(about, details):
    """Build the system prompt describing the data and the expected columns."""
    if isinstance(details, str):
        # A plain string is used verbatim; joining it would split it into
        # single characters.
        columns = details
    else:
        columns = ', '.join(str(column) for column in details)
    return (
        "You are a helpful assistant for cleaning and organizing the data. \n"
        f"This data is about {about}. \n"
        f"Output should be well organized in the form of JSON with the following columns: {columns}.\n"
        "Do not add extra details apart from JSON.\n"
        "If there is no such data, return an empty list.\n"
    )


def _process_chunk(client, chunk, model, system_message):
    """Send one chunk to the model and return the completion text ('' if none)."""
    # NOTE(review): max_tokens=8192 looks like the full context size implied by
    # the default model name; confirm the API accepts it together with a prompt.
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": chunk},
        ],
        temperature=1,
        max_tokens=8192,
        top_p=1,
        stream=False,
        stop=None,
    )
    # Guard against a missing message body so text scanning never sees None.
    return completion.choices[0].message.content or ''


def process_and_save_json(input_file_path, output_file_path, api_key, chunk_size=2048, overlap_size=256, model="llama3-8b-8192", about='Events', details=('name', 'details')):
    """
    Processes the input file in overlapping chunks, interacts with the model, and saves the JSON output to a file.

    Each chunk is sent to the model together with a system prompt built from
    ``about`` and ``details``. Every JSON object found in the model's reply has
    its scalar values converted to strings and is appended to one list, which
    is written to ``output_file_path`` as indented JSON. Replies fragments that
    look like objects but cannot be decoded are printed and skipped.

    Parameters:
    input_file_path (str): Path to the input file containing data (read as UTF-8).
    output_file_path (str): Path to the output JSON file.
    api_key (str): Groq API key for authentication.
    chunk_size (int): Size of each chunk of data, in characters. Must be positive.
    overlap_size (int): Number of characters to overlap between chunks. Must be smaller than chunk_size.
    model (str): Model identifier to use for processing.
    about (str): Description of the data for the model.
    details (list or str): List of column names expected in the output JSON or a specific condition.
        A string is inserted into the prompt verbatim.

    Returns:
    None. The result is written to ``output_file_path``.

    Raises:
    ValueError: If chunk_size is not positive or overlap_size is not smaller
        than chunk_size (either would keep the chunk reader from advancing).
    """
    # Validate before creating the client or touching any file: these values
    # would otherwise make the chunk reader loop forever.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")
    if overlap_size >= chunk_size:
        raise ValueError(
            f"overlap_size ({overlap_size!r}) must be smaller than chunk_size ({chunk_size!r})"
        )

    # Initialize the Groq client with the provided API key
    client = Groq(api_key=api_key)

    # The prompt does not depend on the chunk, so build it once.
    system_message = _build_system_message(about, details)

    combined_output = []
    # Read and process the file in chunks
    for chunk in _read_file_in_chunks(input_file_path, chunk_size, overlap_size):
        output = _process_chunk(client, chunk, model, system_message)
        combined_output.extend(
            _stringify_json_values(json_data)
            for json_data in _extract_json_objects(output)
        )

    # Output the combined result to a JSON file
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(combined_output, f, indent=4)
    print(f"Processing complete. Output saved to '{output_file_path}'.")