import math
import sys
class TextProcessor:
    """Analyse and tokenize a piece of text.

    The text under analysis is stored in ``self.texto``; every method
    either operates on it or on explicit string arguments.
    """

    def __init__(self, texto):
        # The text every analysis method operates on.
        self.texto = texto

    def entropy(self):
        """Return ``(counts, entropy)`` for the stored text.

        ``counts`` maps each character to its frequency and ``entropy``
        is the Shannon entropy (base 2) of the character distribution.
        An empty text yields ``({}, 0.0)``.
        """
        simbolos = {}
        total_caracteres = len(self.texto)
        # Fix: the original divided by zero for an empty text.
        if total_caracteres == 0:
            return simbolos, 0.0
        for caracter in self.texto:
            simbolos[caracter] = simbolos.get(caracter, 0) + 1
        entropia = 0.0
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)
        return simbolos, entropia

    def common_string(self, cadena1, cadena2):
        """Return the longest common substring of ``cadena1`` and ``cadena2``.

        Ties are resolved in favour of the match found first; ``''`` is
        returned when the strings share nothing.
        """
        longitud1 = len(cadena1)
        longitud2 = len(cadena2)
        comun = ''
        for i in range(longitud1):
            for j in range(longitud2):
                k = 0
                while (i + k < longitud1 and j + k < longitud2
                       and cadena1[i + k] == cadena2[j + k]):
                    k += 1
                # Fix: track only strictly longer matches instead of
                # collecting every common substring in a list and calling
                # max(..., key=len) — same result (first longest wins),
                # without the O(n^3) memory.
                if k > len(comun):
                    comun = cadena1[i:i + k]
        return comun

    def magic_split(self):
        """Return the symbol whose occurrence-gap spread is smallest.

        For each symbol occurring more than once, the gaps between
        consecutive occurrences are measured; the spread is
        ``max(gaps) - min(gaps)``.  Symbols with a trivial spread
        (0 or 1) are discarded.  Returns ``None`` when no symbol
        qualifies (fix: the original raised ``ValueError`` by calling
        ``min()`` on an empty dict).
        """
        unique_symbols = set(self.texto)
        symbol_distances = {}
        for symbol in unique_symbols:
            indices = [i for i, char in enumerate(self.texto) if char == symbol]
            if len(indices) > 1:
                symbol_distances[symbol] = [
                    indices[i + 1] - indices[i] for i in range(len(indices) - 1)
                ]
        # Spread of the gaps between consecutive occurrences of each symbol.
        # (Values are non-empty by construction, so no extra filter needed.)
        variation = {symbol: max(distances) - min(distances)
                     for symbol, distances in symbol_distances.items()}
        mins = {v: variation[v] for v in variation
                if variation[v] != 0 and variation[v] != 1}
        if not mins:
            return None
        return min(mins, key=mins.get)

    def rotate_string(self, string, n):
        """Return ``string`` rotated left by ``n`` positions.

        ``n`` may exceed ``len(string)``; an empty string is returned
        unchanged (fix: the original raised ``ZeroDivisionError`` on
        ``n % 0``).
        """
        if not string:
            return string
        indice = n % len(string)
        return string[indice:] + string[:indice]

    def rotate_compare(self, tokiA, tokiB):
        """Best common substring between all rotations of the longer token
        and the shorter token.

        A candidate must be longer than one character and strictly shorter
        than the longer token; ``''`` is returned when none qualifies.
        """
        # Fix: the original compared the tokens lexicographically
        # (``tokiA >= tokiB``) where a length comparison was clearly
        # intended, and computed ``ltokA`` from the wrong token in the
        # else-branch.
        if len(tokiA) >= len(tokiB):
            tokA, tokB = tokiA, tokiB
        else:
            tokA, tokB = tokiB, tokiA
        ltokA = len(tokA)
        rotations = {}
        for i in range(ltokA):
            tokrotated = self.rotate_string(tokA, i)
            rotations[str(i)] = self.common_string(tokrotated, tokB)
        best_r = ""
        for rot in rotations.values():
            lrot = len(rot)
            # Keep matches longer than 1 char, shorter than the whole
            # token, and longer than the current best.
            if 1 < lrot < ltokA and lrot > len(best_r):
                best_r = rot
        return best_r

    def get_subTokens(self, spl):
        """Split the text on ``spl`` and return the deduplicated
        ``rotate_compare`` results for every ordered pair of distinct
        chunks."""
        sub_tokens = self.texto.split(spl)
        toks = set()
        for tok in sub_tokens:
            for tok2 in sub_tokens:
                if tok != tok2:
                    toks.add(self.rotate_compare(tok, tok2))
        return list(toks)

    def tokenize(self, spliter_optimo):
        """Split the text on ``spliter_optimo`` and, inside each chunk,
        mark the longest discovered sub-token.

        Returns a dict mapping each chunk that contains a sub-token to
        ``" <prefix>-<token>-<suffix>"`` (first two split pieces only);
        chunks without a match are omitted.
        """
        tokens = self.get_subTokens(spliter_optimo)
        tokenized_sentence = {}
        chunk = self.texto.split(spliter_optimo)
        for txt in chunk:
            best_split = ""
            for tok in tokens:
                if tok != "":
                    lt = len(tok)
                    spltxt = txt.split(tok)
                    # Fix: dropped unused locals l0/l1 from the original.
                    if len(spltxt) > 1 and lt < len(txt) and lt > len(best_split):
                        best_split = tok
                        tokenized_sentence[txt] = (
                            " " + spltxt[0] + "-" + tok + "-" + spltxt[1]
                        )
        return tokenized_sentence