File size: 6,702 Bytes
9c60983 8b49747 ed89188 9c60983 8b49747 9c60983 8b49747 9c60983 8b49747 9c60983 8b49747 9c60983 34768ec 8b49747 9c60983 34768ec 8b49747 9c60983 8b49747 9c60983 9a23d03 8b49747 9a23d03 b524a09 9a23d03 043a1be 9a23d03 043a1be 9a23d03 8b49747 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 |
---
library_name: transformers
tags:
- cryptology
- cipher
datasets:
- agentlans/high-quality-english-sentences
language:
- en
base_model:
- google-t5/t5-base
license: apache-2.0
---
This project contains a text-to-text model designed to decrypt English text encoded using a substitution cipher.
In a substitution cipher, each letter in the plaintext is replaced by a corresponding, unique letter to form the ciphertext.
The model leverages statistical and linguistic properties of English to make educated guesses about the letter substitutions,
aiming to recover the original plaintext message.
This model is for monoalphabetic English substitution ciphers and it outputs the alphabet used in encoding.
Example:
Encoded text:
**Hd adcdcwda yod drdqyn zk zsa boiluozzu.**
Plain text:
**We remember the events of our childhood.**
Alphabet (output):
**rcme...wi.fl.sh.nvu.d.b.to**
Here 'r' is number 1 in the alphabet and that is why we use 'a' instead of 'r' in encoding.
Single Model Usage:
```py
#Load the model and tokenizer
cipher_text = "" #Encoded text here!
inputs = tokenizer(cipher_text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
outputs = model.generate(inputs["input_ids"], max_length=256)
decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
```
Full Pipeline Usage:
```py
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from string import ascii_lowercase
import Levenshtein
import random
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng")
alphabet_model = AutoModelForSeq2SeqLM.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng").to(device)
correction_model = AutoModelForSeq2SeqLM.from_pretrained("Cipher-AI/AutoCorrect-EN-v2").to(device)
def similarity_percentage(s1, s2):
distance = Levenshtein.distance(s1, s2)
max_len = max(len(s1), len(s2))
similarity = (1 - distance / max_len) * 100
return similarity
def decode(cipher_text, key):
decipher_map = {ascii_lowercase[i]: j for i, j in enumerate(key[:26])}
decipher_map.update({ascii_lowercase[i].upper(): j.upper() for i, j in enumerate(key[:26])})
ans = ''.join(map(lambda x: decipher_map[x] if x in decipher_map else x, cipher_text))
return ans
def model_pass(model, input, max_length=256):
inputs = tokenizer(input, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
outputs = model.generate(inputs["input_ids"], max_length=max_length)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
return result
def decipher(cipher_text, key) -> str:
decipher_map = {ascii_lowercase[i]: j for i, j in enumerate(key[0])}
decipher_map.update({ascii_lowercase[i].upper(): j.upper() for i, j in enumerate(key[0])})
result = ''.join(map(lambda x: decipher_map[x] if x in decipher_map else x, cipher_text[0]))
return result
def cipher(plain_text) -> tuple[str, list]:
alphabet_map = list(ascii_lowercase)
random.shuffle(alphabet_map)
alphabet_map = {i : j for i, j in zip(ascii_lowercase, alphabet_map)}
alphabet_map.update({i.upper() : j.upper() for i, j in alphabet_map.items()})
cipher_text = ''.join(map(lambda x: alphabet_map[x] if x in alphabet_map else x, plain_text))
return cipher_text, alphabet_map
def correct_text(cipher_text, model_output):
cipher_text = cipher_text.split(' ')
model_output = model_output.split(' ')
letter_map = {i: {j: 0 for j in ascii_lowercase} for i in ascii_lowercase}
# Levenstein distance for lenghts of words
n = len(cipher_text)
m = len(model_output)
i = 0
j = 0
dp = [[0 for _ in range(m + 1)] for _ in range(n + 1)]
for i in range(n + 1):
dp[i][0] = i
for j in range(m + 1):
dp[0][j] = j
for i in range(1, n + 1):
for j in range(1, m + 1):
if len(cipher_text[i - 1]) == len(model_output[j - 1]):
dp[i][j] = dp[i - 1][j - 1]
else:
dp[i][j] = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) + 1
i = n
j = m
while i > 0 and j > 0:
before = min([(0, dp[i - 1][j - 1]), (1, dp[i - 1][j]), (2, dp[i][j - 1])], key=lambda x: x[1])
match before[0]:
case 0:
if dp[i - 1][j - 1] == dp[i][j]:
# If the same we add them to letter map
cipher = cipher_text[i-1]
model_o = model_output[j-1]
for c_letter, m_letter in zip(cipher.lower(), model_o.lower()):
if c_letter in letter_map and m_letter in letter_map[c_letter]:
letter_map[c_letter][m_letter] += 1
i = i - 1
j = j - 1
case 1:
i = i - 1
case 2:
j = j - 1
for letter in ascii_lowercase:
letter_sum = sum(letter_map[letter].values())
if letter_sum == 0:
# That letter wasn't in the text
letter_map[letter] = None
continue
# Sorted from most accuring to least
letter_map[letter] = [(k, v / letter_sum) for k, v in sorted(letter_map[letter].items(), key=lambda item: item[1], reverse=True)]
change_map = {
i : None for i in ascii_lowercase
}
for i in range(len(ascii_lowercase)):
for letter in ascii_lowercase:
if letter_map[letter] is None:
continue # That letter wasn't in the text
# If None then it didn't get substituted earlier
map_letter = letter_map[letter][i][0]
if (letter_map[letter][i][1] > 0 and (change_map[map_letter] is None
or (change_map[map_letter][2] < letter_map[letter][i][1] and change_map[map_letter][1] >= i))):
change_map[map_letter] = (letter, i, letter_map[letter][i][1])
# Letter, iteration, percentage
change_map = {i[1][0]: i[0] for i in change_map.items() if i[1] is not None}
for letter in ascii_lowercase:
if letter not in change_map:
change_map[letter] = '.'
# Add uppercases
change_map.update(
{
i[0].upper() : i[1].upper() for i in change_map.items()
}
)
new_text = []
for cipher in cipher_text:
new_word = ""
for c_letter in cipher:
if c_letter in change_map:
new_word += change_map[c_letter]
else:
new_word += c_letter
new_text.append(new_word)
return ' '.join(new_text)
def crack_sub(cipher_text):
output = model_pass(alphabet_model, cipher_text, 26)
decoded = decode(cipher_text, output)
second_pass = model_pass(correction_model, decoded, len(decoded))
second_text = correct_text(cipher_text, second_pass)
third_pass = model_pass(correction_model, second_text, len(decoded))
return third_pass
"""
Use crack_sub() function to solve monoalphabetic substitution ciphers!
"""
``` |