"""Gradio demo that recommends Turkish proverbs for a free-text query.

Four rankings are shown side by side:

* embedding similarity against every definition of a proverb joined together,
* fuzzy (rapidfuzz) similarity against the first definition of each proverb,
* embedding similarity against the proverb text,
* fuzzy similarity against the proverb text.

Every result row shows the proverb together with its first definition.
The dataset and the sentence-embedding model are loaded, and both corpora
are embedded, when this module is imported.
"""

import gradio as gr
import rapidfuzz
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util

# Turkish has two distinct letter pairs, I/ı and İ/i.  Generic lowercasing maps
# "I" to "i", and Python's str.lower() maps "İ" to "i" plus a combining dot, so
# both capitals are folded explicitly before rapidfuzz's default processing.
_TURKISH_CASE_MAP = str.maketrans({"I": "ı", "İ": "i"})

dataset = load_dataset("furkanunluturk/turkce-atasozleri")["train"]

# Materialise the columns as plain lists: they are indexed by corpus position
# in the search functions and handed to the search libraries as sequences.
proverbs = list(dataset["text"])
_all_definitions = list(dataset["all_definitions"])

# First definition of each proverb; this is what the result tables display and
# what the fuzzy definition search matches against.  An entry without any
# definition becomes "" so the list stays aligned with `proverbs`.
definitions = [entries[0]["text"] if entries else "" for entries in _all_definitions]

# All definitions of a proverb joined into one string; used for the
# embedding-based definition search.
combined_definitions = [
    " ".join(entry["text"] for entry in (entries or ()))
    for entries in _all_definitions
]

model = SentenceTransformer("emrecan/bert-base-turkish-cased-mean-nli-stsb-tr")
definition_embeddings = model.encode(combined_definitions, convert_to_tensor=True)
proverb_embeddings = model.encode(proverbs, convert_to_tensor=True)


def embedding_search(input_text, embeddings, top_x):
    """Rank proverbs by embedding similarity between the query and a corpus.

    Args:
        input_text: Query text to embed with the module-level ``model``.
        embeddings: Pre-computed corpus embeddings whose row order matches
            ``proverbs`` / ``definitions``.
        top_x: Maximum number of hits to return.

    Returns:
        A list of ``(proverb, first_definition, score)`` tuples in the order
        produced by ``util.semantic_search``.
    """
    query_embedding = model.encode(input_text, convert_to_tensor=True)
    # semantic_search returns one hit list per query; there is exactly one query.
    hits = util.semantic_search(query_embedding, embeddings, top_k=top_x)[0]
    return [
        (proverbs[hit["corpus_id"]], definitions[hit["corpus_id"]], hit["score"])
        for hit in hits
    ]


def _turkish_process(text):
    """Fold Turkish capital I/İ, then apply rapidfuzz's default processing."""
    return rapidfuzz.utils.default_process(text.translate(_TURKISH_CASE_MAP))


def fuzzy_search(input_text, corpus, top_x):
    """Rank proverbs by rapidfuzz string similarity between the query and a corpus.

    Args:
        input_text: Query text.
        corpus: Sequence of strings whose order matches ``proverbs`` /
            ``definitions``.
        top_x: Maximum number of matches to return.

    Returns:
        A list of ``(proverb, first_definition, score)`` tuples, one per match
        returned by ``rapidfuzz.process.extract``.
    """
    matches = rapidfuzz.process.extract(
        query=input_text,
        choices=corpus,
        limit=top_x,
        processor=_turkish_process,
    )
    # Each match is (choice, score, index); the index maps back to the
    # position-aligned proverb/definition lists.
    return [
        (proverbs[index], definitions[index], score)
        for _choice, score, index in matches
    ]


def recommend_proverbs(input_text, top_x):
    """Run all four searches for a query.

    Args:
        input_text: Query text; ``None`` or whitespace-only text yields no results.
        top_x: Maximum number of results per search (coerced to ``int``).

    Returns:
        A 4-tuple of result lists, in this order: embedding search over
        definitions, embedding search over proverbs, fuzzy search over
        definitions, fuzzy search over proverbs.
    """
    # A blank query carries no signal; skip the model and matcher entirely
    # instead of returning arbitrary rows.
    if not input_text or not input_text.strip():
        return [], [], [], []

    # Coerce once so both search back ends always receive a true int.
    top_x = int(top_x)
    return (
        embedding_search(input_text, definition_embeddings, top_x),
        embedding_search(input_text, proverb_embeddings, top_x),
        fuzzy_search(input_text, definitions, top_x),
        fuzzy_search(input_text, proverbs, top_x),
    )


def format_results(results):
    """Convert result tuples into table rows for display.

    Float scores are rendered with four decimal places; any other score value
    is passed through unchanged.
    """
    return [
        [proverb, definition, f"{score:.4f}" if isinstance(score, float) else score]
        for proverb, definition, score in results
    ]


def search_proverbs(input_text, top_x):
    """Gradio callback: run every search and return the four tables.

    The tables are returned in the order of the ``outputs`` components:
    embedding (definition), fuzzy (definition), embedding (proverb),
    fuzzy (proverb).  Note that this differs from the order in which
    ``recommend_proverbs`` returns them.
    """
    (
        embedding_def_results,
        embedding_prov_results,
        fuzzy_def_results,
        fuzzy_prov_results,
    ) = recommend_proverbs(input_text, top_x)
    return (
        format_results(embedding_def_results),
        format_results(fuzzy_def_results),
        format_results(embedding_prov_results),
        format_results(fuzzy_prov_results),
    )


# Gradio input and output components.  In each table the upper-cased header
# is the field that the corresponding search matched against.
inputs = [
    gr.Textbox(label="Input Text", placeholder="Enter a phrase or sentence..."),
    gr.Slider(label="Top X Results", minimum=1, maximum=10, step=1, value=5),
]
outputs = [
    gr.Dataframe(
        headers=["Proverb", "DEFINITION", "Score"],
        label="Embedding-Based Search (Definition)",
        wrap=True,
    ),
    gr.Dataframe(
        headers=["Proverb", "DEFINITION", "WRatio"],
        label="Fuzzy Search (Definition)",
        wrap=True,
    ),
    gr.Dataframe(
        headers=["PROVERB", "Definition", "Score"],
        label="Embedding-Based Search (Proverb)",
        wrap=True,
    ),
    gr.Dataframe(
        headers=["PROVERB", "Definition", "WRatio"],
        label="Fuzzy Search (Proverb)",
        wrap=True,
    ),
]

# Gradio app wiring.
app = gr.Interface(
    fn=search_proverbs,
    inputs=inputs,
    outputs=outputs,
    title="Turkish Proverb Recommender",
    description=(
        "Compare recommendations using embedding-based similarity and fuzzy search. "
        "Search proverbs and definitions based on semantic and literal similarities."
    ),
)

if __name__ == "__main__":
    app.launch()