# Spaces: Running
import json
import os
import re
import tempfile

import gradio as gr
from transformers import pipeline, AutoTokenizer
# Model options, keyed by the label shown in the model dropdown.
#   model_id    - checkpoint identifier handed to the summarization pipeline
#                 and to AutoTokenizer
#   max_tokens  - input token budget applied by safe_trim before summarizing
#   max_length / min_length - length bounds passed to the summarizer call
MODEL_OPTIONS = {
    "BART (10k-BART)": {
        "model_id": "zaferisikli/10k-bart-summary",
        "max_tokens": 1020,
        "max_length": 250,
        "min_length": 70,
    },
    "PEGASUS (10k-Pegasus)": {
        "model_id": "zaferisikli/10k-pegasus-summary",
        "max_tokens": 512,
        "max_length": 256,
        "min_length": 64,
    },
}

# Target sections: the JSON keys of the report that get summarized.
target_items = ["item_1", "item_1A", "item_7", "item_8", "item_9A"]
# Replace wrong company names that show up in generated summaries.
def fix_company_names(text, correct_name="Quotient Technology Inc."):
    """Return *text* with every known wrong company name replaced.

    All wrong names are substituted in a single left-to-right pass, with
    longer names tried first. This matters for two reasons:

    * "GTY" is a prefix of "GTY Technology Holdings Inc."; replacing the
      short form first would leave a dangling " Technology Holdings Inc."
      behind the corrected name.
    * Text that has just been inserted is never re-scanned, so a
      *correct_name* that itself contains one of the wrong names is not
      corrupted.

    Args:
        text: The summary text to clean up.
        correct_name: The name substituted for every wrong name.

    Returns:
        The corrected text. Matching is case-sensitive and substring-based.
    """
    wrong_names = [
        "Bridgeline", "Guidewire", "AppFolio", "GTY",
        "GTY Technology Holdings Inc.",
    ]
    # Regex alternation takes the first alternative that matches, so the
    # longest names must come first.
    longest_first = sorted(wrong_names, key=len, reverse=True)
    pattern = "|".join(re.escape(name) for name in longest_first)
    # A callable replacement inserts correct_name literally (no backslash
    # or group-reference interpretation).
    return re.sub(pattern, lambda _match: correct_name, text)
# Trim text to a token budget.
def safe_trim(text, tokenizer, max_tokens):
    """Return *text* cut down to at most *max_tokens* tokens.

    Args:
        text: The text to trim.
        tokenizer: Object providing ``encode(text, truncation=False)`` and
            ``decode(ids, skip_special_tokens=True)``.
        max_tokens: Maximum number of token ids to keep, counted on the
            output of ``tokenizer.encode``.

    Returns:
        *text* itself when it already fits the budget; otherwise the decoded
        form of the first *max_tokens* token ids.
    """
    input_ids = tokenizer.encode(text, truncation=False)
    if len(input_ids) <= max_tokens:
        # Nothing to cut: skip the encode/decode round trip, which is not
        # guaranteed to reproduce the input text exactly.
        return text
    return tokenizer.decode(input_ids[:max_tokens], skip_special_tokens=True)
# Summarization.

# Pipelines and tokenizers already loaded in this process, keyed by
# (kind, model id). Loading a model is far slower than handling a request,
# so each one is built on first use and reused afterwards. The trade-off is
# that every model used so far stays in memory.
_LOADED_MODELS = {}


def _load_once(kind, model_id, factory):
    """Return the cached object for (kind, model_id), calling factory() on first use."""
    key = (kind, model_id)
    if key not in _LOADED_MODELS:
        _LOADED_MODELS[key] = factory()
    return _LOADED_MODELS[key]


def _read_report(json_file):
    """Load the uploaded report and return it as a dict, or raise gr.Error."""
    if json_file is None:
        raise gr.Error("Please upload a 10-K JSON file first.")
    # Accept both an object exposing `.name` (a file path) and a plain path.
    path = getattr(json_file, "name", json_file)
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as err:
        raise gr.Error(f"Could not read the upload as UTF-8 JSON: {err}") from err
    if not isinstance(data, dict):
        raise gr.Error("The uploaded JSON must be an object keyed by section name.")
    return data


def summarize_json(json_file, selected_model, is_turkish):
    """Summarize the target sections of an uploaded 10-K JSON report.

    For each key in ``target_items`` the section text is (optionally
    translated Turkish -> English,) trimmed to the model's token budget,
    summarized, (optionally translated back to Turkish,) and passed through
    ``fix_company_names``. Sections that are missing, blank, or not strings
    are reported as ``"EMPTY"``.

    Args:
        json_file: The uploaded file: an object whose ``name`` attribute is
            its path, or the path itself.
        selected_model: Key into ``MODEL_OPTIONS``.
        is_turkish: When true, translate each section to English before
            summarizing and translate the summary back to Turkish.

    Returns:
        A tuple ``(output_path, readable_text)``: the path of the written
        JSON file (``{"summary": {...}}``) and a text rendering of the same
        summaries.

    Raises:
        gr.Error: If no file was uploaded, or it is not a UTF-8 JSON object.
        KeyError: If *selected_model* is not a key of ``MODEL_OPTIONS``.
    """
    # Validate the upload before paying for any model loading.
    data = _read_report(json_file)

    model_cfg = MODEL_OPTIONS[selected_model]
    model_id = model_cfg["model_id"]
    max_tokens = model_cfg["max_tokens"]
    max_len = model_cfg["max_length"]
    min_len = model_cfg["min_length"]

    summarizer = _load_once(
        "summarization", model_id,
        lambda: pipeline("summarization", model=model_id),
    )
    tokenizer = _load_once(
        "tokenizer", model_id,
        lambda: AutoTokenizer.from_pretrained(model_id),
    )

    tr2en = en2tr = None
    if is_turkish:
        tr_en_id = "Helsinki-NLP/opus-mt-tc-big-tr-en"
        en_tr_id = "Helsinki-NLP/opus-mt-tc-big-en-tr"
        tr2en = _load_once(
            "translation", tr_en_id,
            lambda: pipeline("translation", model=tr_en_id),
        )
        en2tr = _load_once(
            "translation", en_tr_id,
            lambda: pipeline("translation", model=en_tr_id),
        )

    summary_output = {}
    for item in target_items:
        value = data.get(item)
        # null / list / number sections have no text to summarize.
        raw = value.strip() if isinstance(value, str) else ""
        if not raw:
            summary_output[item] = "EMPTY"
            continue

        if is_turkish:
            # Sections can be far longer than the translation model accepts;
            # let the pipeline truncate instead of failing on the overflow.
            raw = tr2en(raw, truncation=True)[0]["translation_text"]

        trimmed = safe_trim(raw, tokenizer, max_tokens)
        # truncation=True is a safety net: re-tokenizing the trimmed text
        # inside the pipeline may yield slightly more tokens than were kept.
        eng_summary = summarizer(
            trimmed,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            truncation=True,
        )[0]["summary_text"]

        if is_turkish:
            final = en2tr(eng_summary, truncation=True)[0]["translation_text"]
        else:
            final = eng_summary

        # The company-name correction is applied here.
        summary_output[item] = fix_company_names(final)

    # Write into a fresh per-request directory so concurrent requests cannot
    # overwrite each other's result; the download keeps its file name.
    # NOTE: these directories are not cleaned up by this function.
    output_dir = tempfile.mkdtemp(prefix="summary_")
    output_path = os.path.join(output_dir, "summary_output.json")
    with open(output_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps Turkish characters readable in the file.
        json.dump({"summary": summary_output}, f, indent=2, ensure_ascii=False)

    readable_text = f" Summary generated by: {selected_model}\n\n"
    readable_text += "\n\n".join(
        f"### {k.upper()}\n{v}" for k, v in summary_output.items()
    )
    return output_path, readable_text
# Gradio interface.
with gr.Blocks(title="10-K Summary Generator") as demo:
    gr.HTML("<h2 style='text-align: center;'> 10-K Financial Report Summarizer</h2>")

    # Model choice and source-language toggle.
    with gr.Row():
        model_selector = gr.Dropdown(
            choices=list(MODEL_OPTIONS),
            value="BART (10k-BART)",
            label="Select Model",
        )
        is_turkish = gr.Checkbox(label="Report is in Turkish 🇹🇷", value=False)

    # Upload on the left, generated download on the right.
    with gr.Row():
        input_file = gr.File(label="📤 Upload 10-K JSON", file_types=[".json"])
        output_file = gr.File(label="📥 Download Summary JSON")

    with gr.Row():
        summarize_btn = gr.Button(" Generate Summary")

    # Text rendering of the summaries, collapsed by default.
    with gr.Accordion(" Show Summary Text", open=False):
        summary_text = gr.Textbox(label="", lines=25, show_label=False)

    # summarize_json returns (output file path, readable text), matching the
    # order of the two output components.
    summarize_btn.click(
        fn=summarize_json,
        inputs=[input_file, model_selector, is_turkish],
        outputs=[output_file, summary_text],
    )

demo.launch()