# src/data_management/storage.py: parquet-backed storage for documents, concepts,
# mentions, relationships, and pickled NetworkX graphs.

import pandas as pd
from pathlib import Path
import logging
import uuid
from datetime import datetime
import networkx as nx
import pickle
import string

# Base directory for processed data files
DATA_PATH = Path("data/processed_data")
# Directory for pickled NetworkX graph files
NETWORK_PATH = Path("output/networks")

# --- Filename constants for analysis artifacts ---
FREQUENCY_FILENAME = "analysis_concept_frequencies"
SIMILARITY_FILENAME = "analysis_concept_similarities"
NETWORK_ANALYSIS_FILENAME = "analysis_network_results"
GRAPH_FILENAME = "concept_network"
EMBEDDINGS_FILENAME = "concept_embeddings"
# ------------------------------------

# DataFrame column names
DOC_COLUMNS = ['doc_id', 'filepath', 'publication_date', 'status', 'processed_text_path']
CONCEPT_COLUMNS = ['concept_id', 'name', 'aliases']
MENTION_COLUMNS = ['mention_id', 'doc_id', 'concept_id', 'context_snippet', 'start_char', 'end_char']
RELATIONSHIP_COLUMNS = ['relationship_id', 'source_concept_id', 'target_concept_id', 'type', 'mention_id', 'doc_id', 'sentence']
NETWORK_ANALYSIS_COLUMNS = ['concept_id', 'name', 'degree_centrality', 'betweenness_centrality', 'eigenvector_centrality', 'community_id']

# Logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- DataFrame load/save ---
def load_dataframe(filename: str, columns: list) -> pd.DataFrame:
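    """Load a parquet file from DATA_PATH into a DataFrame.

    Expected columns that are missing get added as None; a missing or
    unreadable file yields an empty DataFrame with the requested columns.
    """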
    filepath = DATA_PATH / f"{filename}.parquet"
    if filepath.exists():
        try:
            df = pd.read_parquet(filepath)
            logging.info(f"Loaded '{filepath}' successfully.")
            if columns:  # validate expected columns only when a list is provided
                for col in columns:
                    if col not in df.columns:
                        logging.warning(f"Column '{col}' missing from '{filepath}'. Adding it...")
                        df[col] = None
            return df
        except Exception as e:
            logging.error(f"Failed to load '{filepath}': {e}")
            return pd.DataFrame(columns=columns if columns else None)
    else:
        logging.info(f"'{filepath}' not found. Creating an empty DataFrame.")
        return pd.DataFrame(columns=columns if columns else None)

def save_dataframe(df: pd.DataFrame, filename: str):
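    """Persist a DataFrame to DATA_PATH as parquet.

    Plain object columns are coerced to StringDtype (in place) so that
    mixed None/str columns serialize cleanly.
    """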
    DATA_PATH.mkdir(parents=True, exist_ok=True)
    filepath = DATA_PATH / f"{filename}.parquet"
    try:
        for col in df.select_dtypes(include=['object']).columns:
            # Skip columns holding lists, dicts, or datetimes; they cannot become StringDtype.
            if df[col].map(type).isin([list, dict, datetime, pd.Timestamp]).any():
                continue
            df[col] = df[col].where(pd.notnull(df[col]), None)
            try:
                df[col] = df[col].astype(pd.StringDtype())
            except TypeError:
                logging.debug(f"Column '{col}' could not be cast to StringDtype; keeping its original dtype.")
        df.to_parquet(filepath, index=False)
        logging.info(f"DataFrame saved successfully to '{filepath}'.")
    except Exception as e:
        logging.error(f"Failed to save DataFrame to '{filepath}': {e}")

# --- Document management ---
def add_document(filepath_str: str, publication_date) -> str | None:
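    """Register a document by its resolved filepath and return its doc_id.

    If the filepath is already registered, the existing doc_id is returned
    instead of creating a duplicate row.
    """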
    documents_df = load_dataframe('documents', DOC_COLUMNS)
    filepath_str = str(Path(filepath_str).resolve())
    existing_doc = documents_df[documents_df['filepath'] == filepath_str]
    if not existing_doc.empty:
        existing_doc_id = existing_doc['doc_id'].iloc[0]
        logging.warning(f"Document already registered: {filepath_str} (ID: {existing_doc_id})")
        return str(existing_doc_id)
    new_doc_id = str(uuid.uuid4())
    try:
        pub_date_obj = pd.to_datetime(publication_date).date()
    except (ValueError, TypeError):  # TypeError covers unparseable inputs such as None
        logging.error(f"Invalid date format: {publication_date}. Storing None instead.")
        pub_date_obj = None
    new_document_data = {'doc_id': new_doc_id, 'filepath': filepath_str, 'publication_date': pub_date_obj,
                         'status': 'added', 'processed_text_path': None}
    new_row_df = pd.DataFrame([new_document_data])
    if pub_date_obj is not None:
        new_row_df['publication_date'] = pd.to_datetime(new_row_df['publication_date'])
        dtype_dict = {'publication_date': 'datetime64[s]'}
    else:
        dtype_dict = {}
    documents_df = pd.concat([documents_df, new_row_df], ignore_index=True)
    for col, dtype in dtype_dict.items():
        try:
            documents_df[col] = documents_df[col].astype(dtype)
        except TypeError:
            logging.warning(f"Could not cast column '{col}' to '{dtype}'.")
    save_dataframe(documents_df, 'documents')
    logging.info(f"New document added: {filepath_str} (ID: {new_doc_id})")
    return new_doc_id

def update_document_status(doc_id: str, new_status: str, text_path: str | None = None):
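    """Update a document's status and, optionally, its processed-text path."""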
    docs_df = load_dataframe('documents', DOC_COLUMNS)
    doc_index = docs_df[docs_df['doc_id'] == doc_id].index
    if not doc_index.empty:
        idx = doc_index[0]
        docs_df.loc[idx, 'status'] = new_status
        if text_path:
            docs_df.loc[idx, 'processed_text_path'] = text_path
        save_dataframe(docs_df, 'documents')
        logging.info(f"Document status updated: ID {doc_id} -> {new_status}")
    else:
        logging.warning(f"No document found to update: ID {doc_id}")

# --- Concept, mention, and relationship management ---
def add_concept(raw_name: str) -> str | None:
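    """Add a concept under a normalized name and return its concept_id,
    or the existing id if the normalized name is already known.
    Returns None when normalization leaves fewer than two characters.
    """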
    concepts_df = load_dataframe('concepts', CONCEPT_COLUMNS)
    # Normalize: lowercase, strip surrounding punctuation/whitespace,
    # drop a trailing possessive 's, and collapse internal whitespace.
    name = raw_name.lower().strip().strip(string.punctuation + string.whitespace)
    if name.endswith("'s"):
        name = name[:-2].strip()
    name = ' '.join(name.split())
    if not name or len(name) < 2:
        return None
    existing_concept = concepts_df[concepts_df['name'] == name]
    if not existing_concept.empty:
        return str(existing_concept['concept_id'].iloc[0])
    new_concept_id = str(uuid.uuid4())
    new_concept_data = {'concept_id': new_concept_id, 'name': name, 'aliases': [raw_name]}
    new_row_df = pd.DataFrame([new_concept_data])
    concepts_df = pd.concat([concepts_df, new_row_df], ignore_index=True)
    concepts_df['aliases'] = concepts_df['aliases'].astype('object')
    save_dataframe(concepts_df, 'concepts')
    logging.info(f"New concept added: '{name}' (original: '{raw_name}', ID: {new_concept_id})")
    return new_concept_id

def add_mention(doc_id: str, concept_id: str, context: str, start: int, end: int) -> str | None:
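    """Record one mention of a concept in a document; the context snippet
    is truncated to 500 characters."""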
    if concept_id is None:
        return None
    mentions_df = load_dataframe('mentions', MENTION_COLUMNS)
    new_mention_id = str(uuid.uuid4())
    new_mention_data = {'mention_id': new_mention_id, 'doc_id': doc_id, 'concept_id': concept_id,
                        'context_snippet': context[:500], 'start_char': start, 'end_char': end}
    mentions_df = pd.concat([mentions_df, pd.DataFrame([new_mention_data])], ignore_index=True)
    save_dataframe(mentions_df, 'mentions')
    return new_mention_id

def add_relationship(source_concept_id: str, target_concept_id: str, rel_type: str, mention_id: str | None, doc_id: str, sentence: str) -> str | None:
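    """Record a typed relationship between two concepts; the evidence
    sentence is truncated to 500 characters."""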
    if source_concept_id is None or target_concept_id is None:
        return None
    relationships_df = load_dataframe('relationships', RELATIONSHIP_COLUMNS)
    new_relationship_id = str(uuid.uuid4())
    new_relationship_data = {'relationship_id': new_relationship_id, 'source_concept_id': source_concept_id,
                             'target_concept_id': target_concept_id, 'type': rel_type,
                             'mention_id': mention_id, 'doc_id': doc_id, 'sentence': sentence[:500]}
    relationships_df = pd.concat([relationships_df, pd.DataFrame([new_relationship_data])], ignore_index=True)
    save_dataframe(relationships_df, 'relationships')
    return new_relationship_id

# --- NetworkX graph load/save ---
def save_network(graph: nx.Graph, filename: str):
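    """Pickle a NetworkX graph to NETWORK_PATH."""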
    NETWORK_PATH.mkdir(parents=True, exist_ok=True)
    filepath = NETWORK_PATH / f"{filename}.pkl"
    try:
        with open(filepath, 'wb') as f:
            pickle.dump(graph, f)
        logging.info(f"NetworkX graph saved to '{filepath}'.")
    except Exception as e:
        logging.error(f"Failed to save graph to '{filepath}': {e}")

def load_network(filename: str) -> nx.Graph:
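    """Unpickle a NetworkX graph from NETWORK_PATH; returns an empty
    graph when the file is missing or unreadable."""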
    filepath = NETWORK_PATH / f"{filename}.pkl"
    if filepath.exists():
        try:
            with open(filepath, 'rb') as f:
                graph = pickle.load(f)
            logging.info(f"NetworkX graph loaded from '{filepath}'.")
            return graph
        except Exception as e:
            logging.error(f"Failed to load graph from '{filepath}': {e}")
            return nx.Graph()
    else:
        logging.warning(f"Graph file not found: '{filepath}'")
        return nx.Graph()
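
# Minimal usage sketch (smoke test). The sample path, date, and concept below
# are illustrative placeholders, not files or data shipped with the project;
# running this writes parquet files under data/processed_data.
if __name__ == "__main__":
    sample_doc_id = add_document("data/raw/sample.txt", "2024-01-15")
    sample_concept_id = add_concept("Machine Learning")
    if sample_doc_id and sample_concept_id:
        add_mention(sample_doc_id, sample_concept_id, "machine learning improves retrieval", 0, 16)
        update_document_status(sample_doc_id, "processed")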