# 10ksummary / app.py
# zaferisikli's picture
# Update app.py
# c4f5bfa verified
import json
import os
import re
import tempfile

import gradio as gr
from transformers import pipeline, AutoTokenizer
# Model options, keyed by the label offered in the model dropdown.
# Each entry carries the model identifier, the token budget used when trimming
# the input text ("max_tokens"), and the summary length bounds handed to the
# summarizer ("max_length" / "min_length").
MODEL_OPTIONS = {
    "BART (10k-BART)": {
        "model_id": "zaferisikli/10k-bart-summary",
        "max_tokens": 1020,
        "max_length": 250,
        "min_length": 70,
    },
    "PEGASUS (10k-Pegasus)": {
        "model_id": "zaferisikli/10k-pegasus-summary",
        "max_tokens": 512,
        "max_length": 256,
        "min_length": 64,
    },
}
# Targeted report sections; each name is looked up as a key in the uploaded
# JSON document.
target_items = [
    "item_1",
    "item_1A",
    "item_7",
    "item_8",
    "item_9A",
]
# Company names that are treated as wrong and replaced by fix_company_names.
_WRONG_COMPANY_NAMES = (
    "Bridgeline",
    "Guidewire",
    "AppFolio",
    "GTY",
    "GTY Technology Holdings Inc.",
)

# A single alternation with the longest names first, so that
# "GTY Technology Holdings Inc." is matched as a whole instead of having only
# its "GTY" prefix replaced (which would leave " Technology Holdings Inc."
# dangling after the corrected name).
_WRONG_COMPANY_NAME_RE = re.compile(
    "|".join(
        re.escape(name)
        for name in sorted(_WRONG_COMPANY_NAMES, key=len, reverse=True)
    )
)


def fix_company_names(text, correct_name="Quotient Technology Inc."):
    """Replace every known wrong company name in *text* with *correct_name*.

    Matching is a plain, case-sensitive substring match. All names are
    replaced in one pass over the original text, so already-inserted
    replacement text is never scanned again (this matters when
    *correct_name* itself contains one of the wrong names).

    Args:
        text: The text to correct.
        correct_name: The company name to substitute for each wrong name.

    Returns:
        The corrected text.
    """
    # A callable replacement keeps any backslashes in correct_name literal
    # instead of letting re interpret them as escape sequences.
    return _WRONG_COMPANY_NAME_RE.sub(lambda _match: correct_name, text)
# Trim text according to its token count
def safe_trim(text, tokenizer, max_tokens):
    """Return *text* reduced to at most its first *max_tokens* token ids.

    The text is encoded without truncation, the id sequence is cut to the
    first *max_tokens* entries, and the remaining ids are decoded back to a
    string with special tokens skipped.

    Args:
        text: The text to shorten.
        tokenizer: Object providing ``encode(text, truncation=...)`` and
            ``decode(ids, skip_special_tokens=...)``.
        max_tokens: Maximum number of token ids to keep.

    Returns:
        The decoded text of the kept token ids.
    """
    # NOTE(review): the limit counts whatever ids encode() returns, so any
    # special tokens the tokenizer adds presumably count toward it - confirm.
    return tokenizer.decode(
        tokenizer.encode(text, truncation=False)[:max_tokens],
        skip_special_tokens=True,
    )
# Summarization workflow
def summarize_json(json_file, selected_model, is_turkish):
    """Summarize the targeted sections of an uploaded 10-K JSON report.

    For every name in ``target_items`` the matching text is (optionally
    translated to English,) trimmed to the model's token budget, summarized,
    (optionally translated back to Turkish,) and passed through
    ``fix_company_names``. Sections without text are reported as ``"EMPTY"``.

    Args:
        json_file: The uploaded file from the Gradio ``File`` input: either an
            object exposing its path as ``.name`` or a plain path string.
        selected_model: Key into ``MODEL_OPTIONS`` selecting the summarizer.
        is_turkish: When true, sections are translated Turkish -> English
            before summarizing and summaries are translated back afterwards.

    Returns:
        A ``(output_path, readable_text)`` tuple: the path of a JSON file
        containing ``{"summary": {section: summary}}`` and a plain-text
        rendering of the same summaries.

    Raises:
        gr.Error: If no file was uploaded, the file is not valid UTF-8 JSON,
            or its top level is not a JSON object.
        KeyError: If *selected_model* is not a key of ``MODEL_OPTIONS``.
    """
    if json_file is None:
        raise gr.Error("Please upload a 10-K JSON file first.")
    # Accept both an object exposing the path as ``.name`` and a plain path.
    json_path = getattr(json_file, "name", json_file)

    # Read and validate the upload before loading any model so that bad input
    # fails fast instead of after an expensive model load.
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as err:
        raise gr.Error(f"The uploaded file is not valid JSON: {err}") from err
    if not isinstance(data, dict):
        raise gr.Error("The uploaded JSON must be an object keyed by section name.")

    model_cfg = MODEL_OPTIONS[selected_model]
    model_id = model_cfg["model_id"]
    max_tokens = model_cfg["max_tokens"]
    max_len = model_cfg["max_length"]
    min_len = model_cfg["min_length"]

    summarizer = pipeline("summarization", model=model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # The translation pipelines are only needed for Turkish reports.
    tr2en = en2tr = None
    if is_turkish:
        tr2en = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-tr-en")
        en2tr = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-en-tr")

    summary_output = {}
    for item in target_items:
        value = data.get(item)
        # Missing, null, non-text and whitespace-only sections all count as
        # empty rather than crashing on ``.strip()``.
        raw = value.strip() if isinstance(value, str) else ""
        if not raw:
            summary_output[item] = "EMPTY"
            continue

        if is_turkish:
            # NOTE(review): the full section is translated before it is
            # trimmed; translation models typically have their own input
            # length limit - confirm long sections are handled correctly.
            raw = tr2en(raw)[0]["translation_text"]

        trimmed = safe_trim(raw, tokenizer, max_tokens)
        eng_summary = summarizer(
            trimmed,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
        )[0]["summary_text"]
        final = en2tr(eng_summary)[0]["translation_text"] if is_turkish else eng_summary

        # The company-name correction is applied here.
        summary_output[item] = fix_company_names(final)

    # Write into a fresh per-call directory so overlapping requests cannot
    # overwrite each other's output; the download file name stays the same.
    output_path = os.path.join(
        tempfile.mkdtemp(prefix="10k_summary_"), "summary_output.json"
    )
    with open(output_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps Turkish characters readable in the UTF-8
        # file instead of escaping them as \uXXXX sequences.
        json.dump({"summary": summary_output}, f, indent=2, ensure_ascii=False)

    readable_text = f" Summary generated by: {selected_model}\n\n"
    readable_text += "\n\n".join(
        f"### {k.upper()}\n{v}" for k, v in summary_output.items()
    )
    return output_path, readable_text
# Gradio interface
# NOTE(review): the component nesting below was reconstructed from a source
# whose indentation had been lost - confirm the row grouping matches the
# intended layout.
with gr.Blocks(title="10-K Summary Generator") as demo:
    gr.HTML("<h2 style='text-align: center;'> 10-K Financial Report Summarizer</h2>")

    # Model choice and report-language toggle.
    with gr.Row():
        model_selector = gr.Dropdown(
            label="Select Model",
            choices=list(MODEL_OPTIONS),
            value="BART (10k-BART)",
        )
        is_turkish = gr.Checkbox(value=False, label="Report is in Turkish 🇹🇷")

    # Report upload and summary download.
    with gr.Row():
        input_file = gr.File(file_types=[".json"], label="📤 Upload 10-K JSON")
        output_file = gr.File(label="📥 Download Summary JSON")

    with gr.Row():
        summarize_btn = gr.Button(" Generate Summary")

    # Collapsed panel holding the plain-text rendering of the summaries.
    with gr.Accordion(" Show Summary Text", open=False):
        summary_text = gr.Textbox(show_label=False, lines=25, label="")

    # Clicking the button runs summarize_json on the three inputs and routes
    # its two return values to the download slot and the text box.
    summarize_btn.click(
        summarize_json,
        inputs=[input_file, model_selector, is_turkish],
        outputs=[output_file, summary_text],
    )

demo.launch()