import os
import re
import json

from groq import Groq

# Shortest "{...}" span. Only used to report and skip text that is not valid JSON.
_BRACE_SPAN = re.compile(r'\{.*?\}', re.DOTALL)

# Visual separator printed around rejected model output.
_SEPARATOR = "+++++++++++++++++++++++++++++++++++++++++++++++++"


def _read_file_in_chunks(file_path, chunk_size, overlap_size):
    """Yield overlapping ``chunk_size``-character windows over the file's text.

    Consecutive windows share ``overlap_size`` characters. Iteration stops as
    soon as a window reaches the end of the text, so no trailing window made
    only of already-seen overlap is produced. An empty file yields nothing.
    The caller must guarantee ``0 <= overlap_size < chunk_size``.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        buffer = f.read()

    step = chunk_size - overlap_size
    start = 0
    while start < len(buffer):
        end = start + chunk_size
        yield buffer[start:end]
        if end >= len(buffer):
            # This window already covered the tail of the text.
            break
        start += step


def _extract_json_objects(text):
    """Return ``(objects, rejected)`` for the brace-delimited spans in ``text``.

    ``objects`` holds every JSON object that could be decoded starting at a
    ``{``; nested objects and ``}`` characters inside string values are handled
    because a real JSON decoder finds the end of each object. ``rejected``
    holds the raw text of spans that start with ``{`` but are not valid JSON.
    """
    decoder = json.JSONDecoder()
    objects = []
    rejected = []

    pos = text.find('{')
    while pos != -1:
        try:
            obj, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            span = _BRACE_SPAN.match(text, pos)
            if span is None:
                # No closing brace anywhere after this point: nothing left to find.
                break
            rejected.append(span.group())
            end = span.end()
        else:
            objects.append(obj)
        pos = text.find('{', end)

    return objects, rejected


def _ensure_strings_in_json(data):
    """Return a copy of ``data`` where every non-container value is a ``str``.

    Dicts and lists are traversed recursively; dict keys are left untouched.
    """
    if isinstance(data, dict):
        return {k: _ensure_strings_in_json(v) for k, v in data.items()}
    if isinstance(data, list):
        return [_ensure_strings_in_json(item) for item in data]
    return str(data)


def _build_system_message(about, details):
    """Build the system prompt describing the data and the expected columns.

    ``details`` may be a single string (used verbatim) or an iterable of
    column names (joined with ", ").
    """
    if isinstance(details, str):
        # Joining a str would split it into single characters.
        columns = details
    else:
        columns = ', '.join(str(column) for column in details)

    return (
        "You are a helpful assistant for cleaning and organizing the data. \n"
        f"This data is about {about}. \n"
        f"Output should be well organized in the form of JSON with the following columns: {columns}.\n"
        "Do not add extra details apart from JSON.\n"
        "If there is no such data, return an empty list.\n"
    )


def _process_chunk(client, chunk, model, system_message):
    """Send one chunk to the chat-completions endpoint and return the reply text."""
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": chunk},
        ],
        temperature=1,
        max_tokens=8192,
        top_p=1,
        stream=False,
        stop=None,
    )
    return completion.choices[0].message.content


def process_and_save_json(input_file_path, output_file_path, api_key,
                          chunk_size=2048, overlap_size=256,
                          model="llama3-8b-8192", about='Events',
                          details=('name', 'details')):
    """
    Processes the input file in overlapping chunks, interacts with the model,
    and saves the JSON output to a file.

    Each chunk is sent to the model together with a system prompt built from
    ``about`` and ``details``. Every JSON object found in the replies is
    collected, its scalar values are converted to strings, and the combined
    list is written to ``output_file_path``. Text that looks like an object
    but is not valid JSON is printed and skipped.

    Parameters:
    input_file_path (str): Path to the input file containing data.
    output_file_path (str): Path to the output JSON file.
    api_key (str): Groq API key for authentication.
    chunk_size (int): Size of each chunk of data, in characters. Must be > 0.
    overlap_size (int): Number of characters to overlap between chunks.
        Must satisfy 0 <= overlap_size < chunk_size.
    model (str): Model identifier to use for processing.
    about (str): Description of the data for the model.
    details (list or str): Column names expected in the output JSON, or a
        single string describing a specific condition (used verbatim).

    Raises:
    ValueError: If ``chunk_size`` or ``overlap_size`` is out of range.
    """
    # Validate before any file or network access; an overlap that is not
    # smaller than the chunk would never advance through the text.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap_size < chunk_size:
        raise ValueError("overlap_size must satisfy 0 <= overlap_size < chunk_size")

    # Initialize the Groq client with the provided API key
    client = Groq(api_key=api_key)

    # The prompt does not depend on the chunk, so build it once.
    system_message = _build_system_message(about, details)

    combined_output = []

    # Read and process the file in chunks
    for chunk in _read_file_in_chunks(input_file_path, chunk_size, overlap_size):
        output = _process_chunk(client, chunk, model, system_message)

        objects, rejected = _extract_json_objects(output)
        combined_output.extend(_ensure_strings_in_json(obj) for obj in objects)

        for bad_text in rejected:
            print(_SEPARATOR)
            print("Invalid JSON format in extracted text:")
            print(bad_text)
            print(_SEPARATOR)

    # Output the combined result to a JSON file
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(combined_output, f, indent=4)

    print(f"Processing complete. Output saved to '{output_file_path}'.")