Spaces:

zaferisikli
/

10ksummary

Running

App Files Files Community

10ksummary / app.py

zaferisikli

Update app.py

c4f5bfa verified 3 months ago

raw

history blame contribute delete

3.72 kB

	import gradio as gr
	import json
	from transformers import pipeline, AutoTokenizer

	# Model options offered in the UI dropdown, keyed by display label.
	# For each entry, summarize_json reads:
	#   model_id   - identifier passed to the summarization pipeline / tokenizer
	#   max_tokens - token budget handed to safe_trim for the input text
	#   max_length - max_length argument forwarded to the summarizer
	#   min_length - min_length argument forwarded to the summarizer
	MODEL_OPTIONS = {
	    "BART (10k-BART)": {
	        "model_id": "zaferisikli/10k-bart-summary",
	        "max_tokens": 1020,
	        "max_length": 250,
	        "min_length": 70,
	    },
	    "PEGASUS (10k-Pegasus)": {
	        "model_id": "zaferisikli/10k-pegasus-summary",
	        "max_tokens": 512,
	        "max_length": 256,
	        "min_length": 64,
	    },
	}

	# Keys of the uploaded JSON that get summarized, in output order.
	target_items = [
	    "item_1",
	    "item_1A",
	    "item_7",
	    "item_8",
	    "item_9A",
	]

	# Replace known wrong company names with the correct one.
	def fix_company_names(text, correct_name="Quotient Technology Inc."):
	    """Return *text* with every known wrong company name replaced.

	    Matching is plain, case-sensitive substring replacement.

	    Args:
	        text: The text to clean up.
	        correct_name: The name substituted for each wrong name.

	    Returns:
	        The corrected text; an empty string stays empty.
	    """
	    wrong_names = (
	        "Bridgeline",
	        "Guidewire",
	        "AppFolio",
	        "GTY",
	        "GTY Technology Holdings Inc.",
	    )
	    # Replace longest names first. Otherwise the short "GTY" would rewrite
	    # the start of "GTY Technology Holdings Inc." and leave the tail
	    # " Technology Holdings Inc." dangling after the corrected name.
	    for wrong in sorted(wrong_names, key=len, reverse=True):
	        text = text.replace(wrong, correct_name)
	    return text

	# Trim text to a token budget.
	def safe_trim(text, tokenizer, max_tokens):
	    """Return *text* cut down to at most *max_tokens* tokenizer ids.

	    The text is encoded without truncation, the id sequence is sliced to
	    its first *max_tokens* entries, and the slice is decoded back to a
	    string with special tokens skipped.

	    Args:
	        text: Input string.
	        tokenizer: Object providing ``encode(text, truncation=...)`` and
	            ``decode(ids, skip_special_tokens=...)``.
	        max_tokens: Number of leading token ids to keep.

	    Returns:
	        The decoded, trimmed text.
	    """
	    return tokenizer.decode(
	        tokenizer.encode(text, truncation=False)[:max_tokens],
	        skip_special_tokens=True,
	    )

	# Summarization workflow (entry point wired to the "Generate Summary" button).
	# Loaded pipelines/tokenizers, keyed by (kind, model_id), so each model is
	# loaded once per process instead of on every button click.
	_LOADED_MODELS = {}


	def _load_cached(kind, model_id):
	    """Return the pipeline (or tokenizer, for kind "tokenizer") for model_id, loading it once."""
	    key = (kind, model_id)
	    if key not in _LOADED_MODELS:
	        if kind == "tokenizer":
	            _LOADED_MODELS[key] = AutoTokenizer.from_pretrained(model_id)
	        else:
	            _LOADED_MODELS[key] = pipeline(kind, model=model_id)
	    return _LOADED_MODELS[key]


	def _read_report(json_file):
	    """Load the uploaded report and return it as a dict, raising gr.Error on bad input."""
	    if json_file is None:
	        raise gr.Error("Please upload a 10-K JSON file first.")
	    # Accept both an object exposing the path as `.name` (what the original
	    # code expected) and a plain path string.
	    path = getattr(json_file, "name", json_file)
	    try:
	        with open(path, "r", encoding="utf-8") as f:
	            data = json.load(f)
	    except json.JSONDecodeError as err:
	        raise gr.Error(f"Uploaded file is not valid JSON: {err}") from err
	    if not isinstance(data, dict):
	        raise gr.Error("Uploaded JSON must be an object keyed by item name.")
	    return data


	def summarize_json(json_file, selected_model, is_turkish):
	    """Summarize the target sections of an uploaded 10-K JSON file.

	    For every key in ``target_items`` the section text is (optionally)
	    translated Turkish -> English, trimmed to the model's token budget,
	    summarized, (optionally) translated back to Turkish, and passed through
	    ``fix_company_names``. Missing, blank or non-string sections are
	    reported as ``"EMPTY"``.

	    Args:
	        json_file: Uploaded file from the Gradio File component (an object
	            with a ``.name`` path attribute, or a path string).
	        selected_model: A key of ``MODEL_OPTIONS``.
	        is_turkish: When true, translate input to English before
	            summarizing and the summary back to Turkish afterwards.

	    Returns:
	        A ``(output_path, readable_text)`` tuple: the path of the JSON file
	        written with the summaries, and a text rendering of them.

	    Raises:
	        gr.Error: If no file was uploaded, the file is not valid JSON, or
	            its top level is not a JSON object.
	        KeyError: If ``selected_model`` is not a key of ``MODEL_OPTIONS``.
	    """
	    model_cfg = MODEL_OPTIONS[selected_model]
	    model_id = model_cfg["model_id"]
	    max_tokens = model_cfg["max_tokens"]
	    max_len = model_cfg["max_length"]
	    min_len = model_cfg["min_length"]

	    # Validate the upload before paying for any model loading.
	    data = _read_report(json_file)

	    summarizer = _load_cached("summarization", model_id)
	    tokenizer = _load_cached("tokenizer", model_id)

	    tr2en = en2tr = None
	    if is_turkish:
	        tr2en = pipeline_tr_en = _load_cached("translation", "Helsinki-NLP/opus-mt-tc-big-tr-en")
	        en2tr = _load_cached("translation", "Helsinki-NLP/opus-mt-tc-big-en-tr")
	        del pipeline_tr_en  # alias only used to keep the line above readable

	    summary_output = {}
	    for item in target_items:
	        value = data.get(item)
	        # Anything that is not text (missing key, null, list, ...) counts as empty.
	        raw = value.strip() if isinstance(value, str) else ""
	        if not raw:
	            summary_output[item] = "EMPTY"
	            continue

	        if is_turkish:
	            # NOTE(review): the whole section is translated before trimming;
	            # very long sections may exceed the translation model's input
	            # limit - confirm against real reports.
	            raw = tr2en(raw)[0]["translation_text"]

	        trimmed = safe_trim(raw, tokenizer, max_tokens)
	        eng_summary = summarizer(
	            trimmed, max_length=max_len, min_length=min_len, do_sample=False
	        )[0]["summary_text"]

	        final = en2tr(eng_summary)[0]["translation_text"] if is_turkish else eng_summary

	        # Company-name correction is applied here.
	        # NOTE(review): called with the default correct_name, so listed names
	        # are rewritten to that default for every report - confirm intended.
	        summary_output[item] = fix_company_names(final)

	    # Fixed file name in the working directory; overwritten on every call.
	    output_path = "summary_output.json"
	    with open(output_path, "w", encoding="utf-8") as f:
	        # ensure_ascii=False keeps Turkish characters readable instead of \uXXXX escapes.
	        json.dump({"summary": summary_output}, f, indent=2, ensure_ascii=False)

	    readable_text = f" Summary generated by: {selected_model}\n\n"
	    readable_text += "\n\n".join(f"### {k.upper()}\n{v}" for k, v in summary_output.items())
	    return output_path, readable_text

	# Gradio interface
	with gr.Blocks(title="10-K Summary Generator") as demo:
	    gr.HTML("<h2 style='text-align: center;'> 10-K Financial Report Summarizer</h2>")

	    # Row 1: model choice and input-language toggle.
	    with gr.Row():
	        model_selector = gr.Dropdown(
	            label="Select Model",
	            choices=list(MODEL_OPTIONS),
	            value="BART (10k-BART)",
	        )
	        is_turkish = gr.Checkbox(
	            label="Report is in Turkish 🇹🇷",
	            value=False,
	        )

	    # Row 2: uploaded report on the left, generated summary file on the right.
	    with gr.Row():
	        input_file = gr.File(
	            label="📤 Upload 10-K JSON",
	            file_types=[".json"],
	        )
	        output_file = gr.File(label="📥 Download Summary JSON")

	    # Row 3: the action button.
	    with gr.Row():
	        summarize_btn = gr.Button(" Generate Summary")

	    # Collapsed panel holding the text rendering of the summaries.
	    with gr.Accordion(" Show Summary Text", open=False):
	        summary_text = gr.Textbox(
	            label="",
	            show_label=False,
	            lines=25,
	        )

	    # Clicking the button runs summarize_json on the three inputs and
	    # routes its two return values to the file output and the textbox.
	    summarize_btn.click(
	        fn=summarize_json,
	        inputs=[input_file, model_selector, is_turkish],
	        outputs=[output_file, summary_text],
	    )

	demo.launch()