# src/extraction/extractor.py (AttributeError DÜZELTİLMİŞ TAM KOD) import spacy from pathlib import Path import logging import itertools import re import string # Yerel modüllerimizi içe aktaralım from src.data_management import storage from src.data_management import loaders # extract_text_from_pdf için logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') # --- spaCy Model Yükleme --- nlp = None STOP_WORDS = set() try: nlp = spacy.load("en_core_web_lg") logging.info("spaCy 'en_core_web_lg' modeli başarıyla yüklendi.") STOP_WORDS = nlp.Defaults.stop_words except OSError: logging.error("spaCy 'en_core_web_lg' modeli bulunamadı. Lütfen indirin: python -m spacy download en_core_web_lg") # --- Konsept Belirleme Kriterleri (Aynı kaldı) --- TRUSTED_ENTITY_LABELS = {"PRODUCT", "ORG", "WORK_OF_ART"} OTHER_ENTITY_LABELS = {"PERSON", "EVENT", "LAW", "NORP", "FAC", "GPE", "LOC"} NOUN_CHUNK_PATTERNS = re.compile(r".*\b(learning|network|model|algorithm|system|technique|approach|agent|layer|architecture|transformer|attention)\b$", re.IGNORECASE) MIN_CONCEPT_WORDS = 1 MAX_CONCEPT_WORDS = 6 AI_KEYWORDS = {"artificial intelligence", "machine learning", "deep learning", "neural network", "reinforcement learning", "transformer", "llm", "large language model", "computer vision", "natural language processing", "algorithm", "model", "gpt", "bert", "agent", "attention", "supervised", "unsupervised", "classification", "regression", "clustering"} # --- İlişki Çıkarımı için Fiiller ve Desenler --- RELATION_VERBS = { "use": "USES", "utilize": "USES", "apply": "USES", "employ": "USES", "improve": "IMPROVES", "enhance": "IMPROVES", "extend": "IMPROVES", "outperform": "IMPROVES", "base on": "BASED_ON", "rely on": "BASED_ON", "compare": "COMPARES_TO", "relate": "RELATED_TO", "associate": "RELATED_TO", "link": "RELATED_TO", "propose": "PROPOSES", "introduce": "PROPOSES", "develop": "PROPOSES", } def normalize_and_validate_concept(text: str, is_entity: bool = False, entity_label: str = "") -> str | None: """ Verilen metni temizler, doğrular... """ cleaned_text = text.strip() word_count = len(cleaned_text.split()) if not (MIN_CONCEPT_WORDS <= word_count <= MAX_CONCEPT_WORDS): return None if cleaned_text and all(word.lower() in STOP_WORDS for word in re.findall(r'\b\w+\b', cleaned_text)): return None if cleaned_text.isdigit() or all(c in string.punctuation for c in cleaned_text): return None generic_phrases = {"this approach", "these models", "this technique", "this system", "the model", "the algorithm", "the method", "the approach", "the system", "the technique", "our model", "our approach"} if cleaned_text.lower() in generic_phrases: return None return cleaned_text def find_verb_relation(token1: spacy.tokens.Token, token2: spacy.tokens.Token) -> tuple[str, str] | None: """ İki token arasındaki dependency path'e bakarak fiil ilişkisi bulur. """ common_ancestor = None ancestors1 = list(token1.ancestors) ancestors2 = list(token2.ancestors) for t in reversed(ancestors1): if t in ancestors2: common_ancestor = t break if not common_ancestor: return None verb1 = None; head = token1 while head != common_ancestor: if head.pos_ == "VERB": verb1 = head; break head = head.head verb2 = None; head = token2 while head != common_ancestor: if head.pos_ == "VERB": verb2 = head; break head = head.head verb_token = None if common_ancestor.pos_ == "VERB": verb_token = common_ancestor elif verb1 and verb1 == verb2: verb_token = verb1 # elif verb1: verb_token = verb1 # Tek taraflı fiilleri şimdilik yoksayalım # elif verb2: verb_token = verb2 elif common_ancestor.head.pos_ == "VERB": verb_token = common_ancestor.head if verb_token: verb_lemma = verb_token.lemma_ # *** HATA DÜZELTME: Bu satırı geçici olarak kaldırıyoruz/yorum yapıyoruz *** # if verb_token.is_aux or verb_token.is_stop: # return None # ********************************************************************** for verb, rel_type in RELATION_VERBS.items(): if verb_lemma == verb or verb_lemma in verb.split(): logging.debug(f"Fiil ilişkisi bulundu: {token1.text}... {verb_lemma} ({rel_type}) ...{token2.text}") return rel_type, verb_lemma return None def extract_entities_and_relations(text: str, doc_id: str): """ Metinden konseptleri, mention'ları ve İYİLEŞTİRİLMİŞ ilişkileri çıkarır. """ if not nlp: raise RuntimeError("spaCy modeli yüklenemedi.") spacy_doc = nlp(text) potential_concepts = {}; mentions_in_doc = []; valid_mentions = {} processed_spans = set(); added_relations = set() # 1. Adayları Bul candidates = [] for ent in spacy_doc.ents: if ent.label_ in TRUSTED_ENTITY_LABELS or ent.label_ in OTHER_ENTITY_LABELS: candidates.append({"span": ent, "is_entity": True, "label": ent.label_}) for chunk in spacy_doc.noun_chunks: is_covered = any(ent_data["span"].start_char <= chunk.start_char and ent_data["span"].end_char >= chunk.end_char for ent_data in candidates if ent_data["is_entity"]) if not is_covered: candidates.append({"span": chunk, "is_entity": False, "label": ""}) # 2. Adayları Filtrele, Normalleştir ve Kaydet for data in candidates: span = data["span"]; if span in processed_spans: continue validated_text = normalize_and_validate_concept(span.text, data["is_entity"], data["label"]) if not validated_text: processed_spans.add(span); continue concept_lemma = span.lemma_.lower().strip() if span.lemma_ else validated_text.lower() is_concept = False if data["is_entity"] and data["label"] in TRUSTED_ENTITY_LABELS: is_concept = True elif NOUN_CHUNK_PATTERNS.match(validated_text): is_concept = True elif any(keyword in concept_lemma.split() or keyword in validated_text.lower().split() for keyword in AI_KEYWORDS): is_concept = True elif validated_text.isupper() and len(validated_text) > 1 and len(validated_text) < 6: is_concept = True if is_concept: concept_id = storage.add_concept(validated_text) if concept_id: mention_id = storage.add_mention( doc_id=doc_id, concept_id=concept_id, context=span.sent.text, start=span.start_char, end=span.end_char ) if mention_id: mention_data = { "mention_id": mention_id, "concept_id": concept_id, "start_char": span.start_char, "end_char": span.end_char, "sentence": span.sent, "root_token": span.root } mentions_in_doc.append(mention_data); valid_mentions[mention_id] = mention_data processed_spans.add(span) # 3. İlişkileri Çıkar for sentence in spacy_doc.sents: mentions_in_sentence = [m for m in mentions_in_doc if m["sentence"] == sentence] if len(mentions_in_sentence) >= 2: for m1_data, m2_data in itertools.combinations(mentions_in_sentence, 2): c1_id = m1_data["concept_id"]; c2_id = m2_data["concept_id"] if c1_id == c2_id: continue rel_pair = tuple(sorted((c1_id, c2_id))) if rel_pair in added_relations: continue relation_found = False relation_info = find_verb_relation(m1_data["root_token"], m2_data["root_token"]) if relation_info: rel_type, verb = relation_info storage.add_relationship( source_concept_id=c1_id, target_concept_id=c2_id, rel_type=rel_type, mention_id=m1_data["mention_id"], doc_id=doc_id, sentence=sentence.text ) relation_found = True; added_relations.add(rel_pair) if not relation_found: storage.add_relationship( source_concept_id=c1_id, target_concept_id=c2_id, rel_type="RELATED_TO", mention_id=m1_data["mention_id"], doc_id=doc_id, sentence=sentence.text ) added_relations.add(rel_pair) def process_documents_for_extraction(): """ Dokümanları işler ve durumu günceller... (Öncekiyle aynı) """ if not nlp: raise RuntimeError("spaCy modeli yüklenemedi.") logging.info("Gelişmiş bilgi çıkarımı için dokümanlar işleniyor...") documents_df = storage.load_dataframe('documents', storage.DOC_COLUMNS) docs_to_process = documents_df[documents_df['status'] == 'added'] if docs_to_process.empty: logging.info("Durumu 'added' olan ve işlenecek doküman bulunamadı.") return processed_count = 0; failed_count = 0 for index, doc_row in docs_to_process.iterrows(): doc_id = doc_row['doc_id']; filepath = Path(doc_row['filepath']) logging.info(f"İşleniyor: {filepath.name} (ID: {doc_id})") text = loaders.extract_text_from_pdf(filepath) if text: try: extract_entities_and_relations(text, doc_id) storage.update_document_status(doc_id, 'processed_v3') # Yeni versiyon durumu processed_count += 1 except Exception as e: logging.exception(f"'{filepath.name}' işlenirken BEKLENMEYEN HATA oluştu: {e}") storage.update_document_status(doc_id, 'extraction_failed_v3') failed_count += 1 else: logging.warning(f"Metin çıkarılamadı: {filepath.name}") storage.update_document_status(doc_id, 'text_extraction_failed') failed_count += 1 logging.info(f"Gelişmiş bilgi çıkarımı tamamlandı. Başarılı: {processed_count}, Başarısız: {failed_count}")