grady

Running

App Files Files Community

grady / utils.py

bstraehle

Update utils.py

d0d0aee verified 20 days ago

raw

history blame contribute delete

2.1 kB

	import os
	import pandas as pd
	from docx import Document
	from pptx import Presentation

	def get_questions(file_path, level):
	    """Load questions from a JSON Lines file, optionally filtered by level.

	    Args:
	        file_path: Path of a JSON Lines file whose records provide the
	            "Question", "Level", "Final answer" and "file_name" fields.
	        level: Level to keep. Values of 0 or below disable the filter.

	    Returns:
	        A list holding one [question, level, final answer, file name]
	        list per selected record.
	    """
	    frame = pd.read_json(file_path, lines=True)

	    # Only a positive level narrows the selection.
	    selected = frame[frame["Level"] == level] if level > 0 else frame

	    columns = ("Question", "Level", "Final answer", "file_name")
	    return [
	        [record[column] for column in columns]
	        for _, record in selected.iterrows()
	    ]

	def is_ext(file_path, ext):
	    """Tell whether the extension of a path matches *ext*, ignoring case.

	    Args:
	        file_path: Path whose final extension is inspected.
	        ext: Expected extension including the leading dot, e.g. ".csv".

	    Returns:
	        True when both extensions are equal once lower-cased.
	    """
	    _, suffix = os.path.splitext(file_path)
	    return ext.lower() == suffix.lower()

	def read_file_json(file_path):
	    """Read a tabular or JSON file and return its content as a JSON string.

	    The loader is picked from the file extension, compared without regard
	    to case: ".csv" uses pd.read_csv, ".xls" and ".xlsx" use pd.read_excel,
	    ".json" and ".jsonl" use pd.read_json.

	    Args:
	        file_path: Path of the file to load.

	    Returns:
	        The loaded DataFrame serialized with DataFrame.to_json(), or an
	        empty string when the extension is not one of the supported ones.
	    """
	    # Resolve the extension once instead of re-splitting the path per branch.
	    ext = os.path.splitext(file_path)[1].lower()

	    if ext == ".csv":
	        df = pd.read_csv(file_path)
	    elif ext in (".xls", ".xlsx"):
	        df = pd.read_excel(file_path)
	    elif ext in (".json", ".jsonl"):
	        # JSON Lines stores one record per line, which read_json only parses
	        # with lines=True; plain .json keeps the default whole-file parsing.
	        df = pd.read_json(file_path, lines=(ext == ".jsonl"))
	    else:
	        return ""

	    return df.to_json()

	def read_docx_text(file_path):
	    """Extract the text of a .docx file, keeping paragraphs and tables in order.

	    The document body is walked once, in document order:

	    * a paragraph whose style name starts with "Heading" is emitted with a
	      newline before and after its text;
	    * any other paragraph is emitted when its text is non-empty;
	    * each table row becomes one line made of the stripped cell texts
	      joined by " | ".

	    Args:
	        file_path: Path of the .docx file, handed to docx.Document.

	    Returns:
	        All emitted pieces joined by single newline characters.
	    """
	    doc = Document(file_path)

	    # Index the proxies by their underlying XML element once, instead of
	    # rescanning doc.paragraphs / doc.tables for every body element.
	    paragraphs_by_element = {
	        paragraph._element: paragraph for paragraph in doc.paragraphs
	    }
	    tables_by_element = {table._element: table for table in doc.tables}

	    text = []

	    for block in doc.element.body:
	        paragraph = paragraphs_by_element.get(block)
	        if paragraph is not None:
	            if paragraph.style.name.startswith("Heading"):
	                text.append("\n" + paragraph.text + "\n")
	            elif paragraph.text:
	                text.append(paragraph.text)
	            continue

	        table = tables_by_element.get(block)
	        if table is not None:
	            for row in table.rows:
	                # Plain " | " separator: the previous literal carried a
	                # stray backslash (invalid escape) into the output.
	                text.append(" | ".join(cell.text.strip() for cell in row.cells))

	    return "\n".join(text)

	def read_pptx_text(file_path):
	    """Extract the text of every slide in a .pptx file.

	    Args:
	        file_path: Path of the presentation, handed to pptx.Presentation.

	    Returns:
	        One chunk per slide, built from the text of each shape exposing a
	        "text" attribute and joined by single newlines; the chunks are
	        joined by two newlines.
	    """
	    presentation = Presentation(file_path)

	    def slide_chunk(slide):
	        # Shapes without a "text" attribute contribute nothing.
	        return "\n".join(
	            shape.text for shape in slide.shapes if hasattr(shape, "text")
	        )

	    return "\n\n".join(slide_chunk(slide) for slide in presentation.slides)