authormist committed on
Commit
b3ae3a1
·
verified ·
1 Parent(s): 4873013

Upload model files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +11 -0
  2. .venv/.gitignore +1 -0
  3. .venv/bin/Activate.ps1 +248 -0
  4. .venv/bin/activate +76 -0
  5. .venv/bin/activate.csh +27 -0
  6. .venv/bin/activate.fish +69 -0
  7. .venv/bin/huggingface-cli +8 -0
  8. .venv/bin/normalizer +8 -0
  9. .venv/bin/pip +8 -0
  10. .venv/bin/pip3 +8 -0
  11. .venv/bin/pip3.13 +8 -0
  12. .venv/bin/python +0 -0
  13. .venv/bin/python3 +0 -0
  14. .venv/bin/python3.13 +0 -0
  15. .venv/bin/tqdm +8 -0
  16. .venv/lib/python3.13/site-packages/PyYAML-6.0.2.dist-info/INSTALLER +1 -0
  17. .venv/lib/python3.13/site-packages/PyYAML-6.0.2.dist-info/LICENSE +20 -0
  18. .venv/lib/python3.13/site-packages/PyYAML-6.0.2.dist-info/METADATA +46 -0
  19. .venv/lib/python3.13/site-packages/PyYAML-6.0.2.dist-info/RECORD +43 -0
  20. .venv/lib/python3.13/site-packages/PyYAML-6.0.2.dist-info/WHEEL +5 -0
  21. .venv/lib/python3.13/site-packages/PyYAML-6.0.2.dist-info/top_level.txt +2 -0
  22. .venv/lib/python3.13/site-packages/_yaml/__init__.py +33 -0
  23. .venv/lib/python3.13/site-packages/certifi-2025.1.31.dist-info/INSTALLER +1 -0
  24. .venv/lib/python3.13/site-packages/certifi-2025.1.31.dist-info/LICENSE +20 -0
  25. .venv/lib/python3.13/site-packages/certifi-2025.1.31.dist-info/METADATA +77 -0
  26. .venv/lib/python3.13/site-packages/certifi-2025.1.31.dist-info/RECORD +14 -0
  27. .venv/lib/python3.13/site-packages/certifi-2025.1.31.dist-info/WHEEL +5 -0
  28. .venv/lib/python3.13/site-packages/certifi-2025.1.31.dist-info/top_level.txt +1 -0
  29. .venv/lib/python3.13/site-packages/certifi/__init__.py +4 -0
  30. .venv/lib/python3.13/site-packages/certifi/__main__.py +12 -0
  31. .venv/lib/python3.13/site-packages/certifi/cacert.pem +0 -0
  32. .venv/lib/python3.13/site-packages/certifi/core.py +114 -0
  33. .venv/lib/python3.13/site-packages/certifi/py.typed +0 -0
  34. .venv/lib/python3.13/site-packages/charset_normalizer-3.4.1.dist-info/INSTALLER +1 -0
  35. .venv/lib/python3.13/site-packages/charset_normalizer-3.4.1.dist-info/LICENSE +21 -0
  36. .venv/lib/python3.13/site-packages/charset_normalizer-3.4.1.dist-info/METADATA +721 -0
  37. .venv/lib/python3.13/site-packages/charset_normalizer-3.4.1.dist-info/RECORD +35 -0
  38. .venv/lib/python3.13/site-packages/charset_normalizer-3.4.1.dist-info/WHEEL +5 -0
  39. .venv/lib/python3.13/site-packages/charset_normalizer-3.4.1.dist-info/entry_points.txt +2 -0
  40. .venv/lib/python3.13/site-packages/charset_normalizer-3.4.1.dist-info/top_level.txt +1 -0
  41. .venv/lib/python3.13/site-packages/charset_normalizer/__init__.py +48 -0
  42. .venv/lib/python3.13/site-packages/charset_normalizer/__main__.py +6 -0
  43. .venv/lib/python3.13/site-packages/charset_normalizer/api.py +668 -0
  44. .venv/lib/python3.13/site-packages/charset_normalizer/cd.py +395 -0
  45. .venv/lib/python3.13/site-packages/charset_normalizer/cli/__init__.py +8 -0
  46. .venv/lib/python3.13/site-packages/charset_normalizer/cli/__main__.py +321 -0
  47. .venv/lib/python3.13/site-packages/charset_normalizer/constant.py +1998 -0
  48. .venv/lib/python3.13/site-packages/charset_normalizer/legacy.py +66 -0
  49. .venv/lib/python3.13/site-packages/charset_normalizer/md.cpython-313-darwin.so +3 -0
  50. .venv/lib/python3.13/site-packages/charset_normalizer/md.py +630 -0
.gitattributes CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ <<<<<<< HEAD
37
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ =======
39
+ >>>>>>> 4873013eddb1c2d779f664501b56e56d5e261341
40
+ .venv/lib/python3.13/site-packages/charset_normalizer/md.cpython-313-darwin.so filter=lfs diff=lfs merge=lfs -text
41
+ .venv/lib/python3.13/site-packages/charset_normalizer/md__mypyc.cpython-313-darwin.so filter=lfs diff=lfs merge=lfs -text
42
+ .venv/lib/python3.13/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text
43
+ .venv/lib/python3.13/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text
44
+ .venv/lib/python3.13/site-packages/pip/_vendor/distlib/w64-arm.exe filter=lfs diff=lfs merge=lfs -text
45
+ .venv/lib/python3.13/site-packages/pip/_vendor/distlib/w64.exe filter=lfs diff=lfs merge=lfs -text
46
+ .venv/lib/python3.13/site-packages/yaml/_yaml.cpython-313-darwin.so filter=lfs diff=lfs merge=lfs -text
.venv/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *
.venv/bin/Activate.ps1 ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <#
2
+ .Synopsis
3
+ Activate a Python virtual environment for the current PowerShell session.
4
+
5
+ .Description
6
+ Pushes the python executable for a virtual environment to the front of the
7
+ $Env:PATH environment variable and sets the prompt to signify that you are
8
+ in a Python virtual environment. Makes use of the command line switches as
9
+ well as the `pyvenv.cfg` file values present in the virtual environment.
10
+
11
+ .Parameter VenvDir
12
+ Path to the directory that contains the virtual environment to activate. The
13
+ default value for this is the parent of the directory that the Activate.ps1
14
+ script is located within.
15
+
16
+ .Parameter Prompt
17
+ The prompt prefix to display when this virtual environment is activated. By
18
+ default, this prompt is the name of the virtual environment folder (VenvDir)
19
+ surrounded by parentheses and followed by a single space (ie. '(.venv) ').
20
+
21
+ .Example
22
+ Activate.ps1
23
+ Activates the Python virtual environment that contains the Activate.ps1 script.
24
+
25
+ .Example
26
+ Activate.ps1 -Verbose
27
+ Activates the Python virtual environment that contains the Activate.ps1 script,
28
+ and shows extra information about the activation as it executes.
29
+
30
+ .Example
31
+ Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
32
+ Activates the Python virtual environment located in the specified location.
33
+
34
+ .Example
35
+ Activate.ps1 -Prompt "MyPython"
36
+ Activates the Python virtual environment that contains the Activate.ps1 script,
37
+ and prefixes the current prompt with the specified string (surrounded in
38
+ parentheses) while the virtual environment is active.
39
+
40
+ .Notes
41
+ On Windows, it may be required to enable this Activate.ps1 script by setting the
42
+ execution policy for the user. You can do this by issuing the following PowerShell
43
+ command:
44
+
45
+ PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
46
+
47
+ For more information on Execution Policies:
48
+ https://go.microsoft.com/fwlink/?LinkID=135170
49
+
50
+ #>
51
+ Param(
52
+ [Parameter(Mandatory = $false)]
53
+ [String]
54
+ $VenvDir,
55
+ [Parameter(Mandatory = $false)]
56
+ [String]
57
+ $Prompt
58
+ )
59
+
60
+ <# Function declarations --------------------------------------------------- #>
61
+
62
+ <#
63
+ .Synopsis
64
+ Remove all shell session elements added by the Activate script, including the
65
+ addition of the virtual environment's Python executable from the beginning of
66
+ the PATH variable.
67
+
68
+ .Parameter NonDestructive
69
+ If present, do not remove this function from the global namespace for the
70
+ session.
71
+
72
+ #>
73
+ function global:deactivate ([switch]$NonDestructive) {
74
+ # Revert to original values
75
+
76
+ # The prior prompt:
77
+ if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
78
+ Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
79
+ Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
80
+ }
81
+
82
+ # The prior PYTHONHOME:
83
+ if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
84
+ Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
85
+ Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
86
+ }
87
+
88
+ # The prior PATH:
89
+ if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
90
+ Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
91
+ Remove-Item -Path Env:_OLD_VIRTUAL_PATH
92
+ }
93
+
94
+ # Just remove the VIRTUAL_ENV altogether:
95
+ if (Test-Path -Path Env:VIRTUAL_ENV) {
96
+ Remove-Item -Path env:VIRTUAL_ENV
97
+ }
98
+
99
+ # Just remove VIRTUAL_ENV_PROMPT altogether.
100
+ if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
101
+ Remove-Item -Path env:VIRTUAL_ENV_PROMPT
102
+ }
103
+
104
+ # Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
105
+ if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
106
+ Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
107
+ }
108
+
109
+ # Leave deactivate function in the global namespace if requested:
110
+ if (-not $NonDestructive) {
111
+ Remove-Item -Path function:deactivate
112
+ }
113
+ }
114
+
115
+ <#
116
+ .Description
117
+ Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
118
+ given folder, and returns them in a map.
119
+
120
+ For each line in the pyvenv.cfg file, if that line can be parsed into exactly
121
+ two strings separated by `=` (with any amount of whitespace surrounding the =)
122
+ then it is considered a `key = value` line. The left hand string is the key,
123
+ the right hand is the value.
124
+
125
+ If the value starts with a `'` or a `"` then the first and last character is
126
+ stripped from the value before being captured.
127
+
128
+ .Parameter ConfigDir
129
+ Path to the directory that contains the `pyvenv.cfg` file.
130
+ #>
131
+ function Get-PyVenvConfig(
132
+ [String]
133
+ $ConfigDir
134
+ ) {
135
+ Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
136
+
137
+ # Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
138
+ $pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
139
+
140
+ # An empty map will be returned if no config file is found.
141
+ $pyvenvConfig = @{ }
142
+
143
+ if ($pyvenvConfigPath) {
144
+
145
+ Write-Verbose "File exists, parse `key = value` lines"
146
+ $pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
147
+
148
+ $pyvenvConfigContent | ForEach-Object {
149
+ $keyval = $PSItem -split "\s*=\s*", 2
150
+ if ($keyval[0] -and $keyval[1]) {
151
+ $val = $keyval[1]
152
+
153
+ # Remove extraneous quotations around a string value.
154
+ if ("'""".Contains($val.Substring(0, 1))) {
155
+ $val = $val.Substring(1, $val.Length - 2)
156
+ }
157
+
158
+ $pyvenvConfig[$keyval[0]] = $val
159
+ Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
160
+ }
161
+ }
162
+ }
163
+ return $pyvenvConfig
164
+ }
165
+
166
+
167
+ <# Begin Activate script --------------------------------------------------- #>
168
+
169
+ # Determine the containing directory of this script
170
+ $VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
171
+ $VenvExecDir = Get-Item -Path $VenvExecPath
172
+
173
+ Write-Verbose "Activation script is located in path: '$VenvExecPath'"
174
+ Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
175
+ Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
176
+
177
+ # Set values required in priority: CmdLine, ConfigFile, Default
178
+ # First, get the location of the virtual environment, it might not be
179
+ # VenvExecDir if specified on the command line.
180
+ if ($VenvDir) {
181
+ Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
182
+ }
183
+ else {
184
+ Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
185
+ $VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
186
+ Write-Verbose "VenvDir=$VenvDir"
187
+ }
188
+
189
+ # Next, read the `pyvenv.cfg` file to determine any required value such
190
+ # as `prompt`.
191
+ $pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
192
+
193
+ # Next, set the prompt from the command line, or the config file, or
194
+ # just use the name of the virtual environment folder.
195
+ if ($Prompt) {
196
+ Write-Verbose "Prompt specified as argument, using '$Prompt'"
197
+ }
198
+ else {
199
+ Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
200
+ if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
201
+ Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
202
+ $Prompt = $pyvenvCfg['prompt'];
203
+ }
204
+ else {
205
+ Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
206
+ Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
207
+ $Prompt = Split-Path -Path $venvDir -Leaf
208
+ }
209
+ }
210
+
211
+ Write-Verbose "Prompt = '$Prompt'"
212
+ Write-Verbose "VenvDir='$VenvDir'"
213
+
214
+ # Deactivate any currently active virtual environment, but leave the
215
+ # deactivate function in place.
216
+ deactivate -nondestructive
217
+
218
+ # Now set the environment variable VIRTUAL_ENV, used by many tools to determine
219
+ # that there is an activated venv.
220
+ $env:VIRTUAL_ENV = $VenvDir
221
+
222
+ $env:VIRTUAL_ENV_PROMPT = $Prompt
223
+
224
+ if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
225
+
226
+ Write-Verbose "Setting prompt to '$Prompt'"
227
+
228
+ # Set the prompt to include the env name
229
+ # Make sure _OLD_VIRTUAL_PROMPT is global
230
+ function global:_OLD_VIRTUAL_PROMPT { "" }
231
+ Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
232
+ New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
233
+
234
+ function global:prompt {
235
+ Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
236
+ _OLD_VIRTUAL_PROMPT
237
+ }
238
+ }
239
+
240
+ # Clear PYTHONHOME
241
+ if (Test-Path -Path Env:PYTHONHOME) {
242
+ Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
243
+ Remove-Item -Path Env:PYTHONHOME
244
+ }
245
+
246
+ # Add the venv to the PATH
247
+ Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
248
+ $Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"
.venv/bin/activate ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file must be used with "source bin/activate" *from bash*
2
+ # You cannot run it directly
3
+
4
+ deactivate () {
5
+ # reset old environment variables
6
+ if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
7
+ PATH="${_OLD_VIRTUAL_PATH:-}"
8
+ export PATH
9
+ unset _OLD_VIRTUAL_PATH
10
+ fi
11
+ if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
12
+ PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
13
+ export PYTHONHOME
14
+ unset _OLD_VIRTUAL_PYTHONHOME
15
+ fi
16
+
17
+ # Call hash to forget past locations. Without forgetting
18
+ # past locations the $PATH changes we made may not be respected.
19
+ # See "man bash" for more details. hash is usually a builtin of your shell
20
+ hash -r 2> /dev/null
21
+
22
+ if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
23
+ PS1="${_OLD_VIRTUAL_PS1:-}"
24
+ export PS1
25
+ unset _OLD_VIRTUAL_PS1
26
+ fi
27
+
28
+ unset VIRTUAL_ENV
29
+ unset VIRTUAL_ENV_PROMPT
30
+ if [ ! "${1:-}" = "nondestructive" ] ; then
31
+ # Self destruct!
32
+ unset -f deactivate
33
+ fi
34
+ }
35
+
36
+ # unset irrelevant variables
37
+ deactivate nondestructive
38
+
39
+ # on Windows, a path can contain colons and backslashes and has to be converted:
40
+ case "$(uname)" in
41
+ CYGWIN*|MSYS*|MINGW*)
42
+ # transform D:\path\to\venv to /d/path/to/venv on MSYS and MINGW
43
+ # and to /cygdrive/d/path/to/venv on Cygwin
44
+ VIRTUAL_ENV=$(cygpath /Users/isaacdavid/Qwen2.5-3B-Instruct-Ori/.venv)
45
+ export VIRTUAL_ENV
46
+ ;;
47
+ *)
48
+ # use the path as-is
49
+ export VIRTUAL_ENV=/Users/isaacdavid/Qwen2.5-3B-Instruct-Ori/.venv
50
+ ;;
51
+ esac
52
+
53
+ _OLD_VIRTUAL_PATH="$PATH"
54
+ PATH="$VIRTUAL_ENV/"bin":$PATH"
55
+ export PATH
56
+
57
+ VIRTUAL_ENV_PROMPT=.venv
58
+ export VIRTUAL_ENV_PROMPT
59
+
60
+ # unset PYTHONHOME if set
61
+ # this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
62
+ # could use `if (set -u; : $PYTHONHOME) ;` in bash
63
+ if [ -n "${PYTHONHOME:-}" ] ; then
64
+ _OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
65
+ unset PYTHONHOME
66
+ fi
67
+
68
+ if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
69
+ _OLD_VIRTUAL_PS1="${PS1:-}"
70
+ PS1="(".venv") ${PS1:-}"
71
+ export PS1
72
+ fi
73
+
74
+ # Call hash to forget past commands. Without forgetting
75
+ # past commands the $PATH changes we made may not be respected
76
+ hash -r 2> /dev/null
.venv/bin/activate.csh ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file must be used with "source bin/activate.csh" *from csh*.
2
+ # You cannot run it directly.
3
+
4
+ # Created by Davide Di Blasi <[email protected]>.
5
+ # Ported to Python 3.3 venv by Andrew Svetlov <[email protected]>
6
+
7
+ alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate'
8
+
9
+ # Unset irrelevant variables.
10
+ deactivate nondestructive
11
+
12
+ setenv VIRTUAL_ENV /Users/isaacdavid/Qwen2.5-3B-Instruct-Ori/.venv
13
+
14
+ set _OLD_VIRTUAL_PATH="$PATH"
15
+ setenv PATH "$VIRTUAL_ENV/"bin":$PATH"
16
+ setenv VIRTUAL_ENV_PROMPT .venv
17
+
18
+
19
+ set _OLD_VIRTUAL_PROMPT="$prompt"
20
+
21
+ if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
22
+ set prompt = "(".venv") $prompt:q"
23
+ endif
24
+
25
+ alias pydoc python -m pydoc
26
+
27
+ rehash
.venv/bin/activate.fish ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file must be used with "source <venv>/bin/activate.fish" *from fish*
2
+ # (https://fishshell.com/). You cannot run it directly.
3
+
4
+ function deactivate -d "Exit virtual environment and return to normal shell environment"
5
+ # reset old environment variables
6
+ if test -n "$_OLD_VIRTUAL_PATH"
7
+ set -gx PATH $_OLD_VIRTUAL_PATH
8
+ set -e _OLD_VIRTUAL_PATH
9
+ end
10
+ if test -n "$_OLD_VIRTUAL_PYTHONHOME"
11
+ set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
12
+ set -e _OLD_VIRTUAL_PYTHONHOME
13
+ end
14
+
15
+ if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
16
+ set -e _OLD_FISH_PROMPT_OVERRIDE
17
+ # prevents error when using nested fish instances (Issue #93858)
18
+ if functions -q _old_fish_prompt
19
+ functions -e fish_prompt
20
+ functions -c _old_fish_prompt fish_prompt
21
+ functions -e _old_fish_prompt
22
+ end
23
+ end
24
+
25
+ set -e VIRTUAL_ENV
26
+ set -e VIRTUAL_ENV_PROMPT
27
+ if test "$argv[1]" != "nondestructive"
28
+ # Self-destruct!
29
+ functions -e deactivate
30
+ end
31
+ end
32
+
33
+ # Unset irrelevant variables.
34
+ deactivate nondestructive
35
+
36
+ set -gx VIRTUAL_ENV /Users/isaacdavid/Qwen2.5-3B-Instruct-Ori/.venv
37
+
38
+ set -gx _OLD_VIRTUAL_PATH $PATH
39
+ set -gx PATH "$VIRTUAL_ENV/"bin $PATH
40
+ set -gx VIRTUAL_ENV_PROMPT .venv
41
+
42
+ # Unset PYTHONHOME if set.
43
+ if set -q PYTHONHOME
44
+ set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
45
+ set -e PYTHONHOME
46
+ end
47
+
48
+ if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
49
+ # fish uses a function instead of an env var to generate the prompt.
50
+
51
+ # Save the current fish_prompt function as the function _old_fish_prompt.
52
+ functions -c fish_prompt _old_fish_prompt
53
+
54
+ # With the original prompt function renamed, we can override with our own.
55
+ function fish_prompt
56
+ # Save the return status of the last command.
57
+ set -l old_status $status
58
+
59
+ # Output the venv prompt; color taken from the blue of the Python logo.
60
+ printf "%s(%s)%s " (set_color 4B8BBE) .venv (set_color normal)
61
+
62
+ # Restore the return status of the previous command.
63
+ echo "exit $old_status" | .
64
+ # Output the original/"old" prompt.
65
+ _old_fish_prompt
66
+ end
67
+
68
+ set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
69
+ end
.venv/bin/huggingface-cli ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #!/Users/isaacdavid/Qwen2.5-3B-Instruct-Ori/.venv/bin/python
2
+ # -*- coding: utf-8 -*-
3
+ import re
4
+ import sys
5
+ from huggingface_hub.commands.huggingface_cli import main
6
+ if __name__ == '__main__':
7
+ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
8
+ sys.exit(main())
.venv/bin/normalizer ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #!/Users/isaacdavid/Qwen2.5-3B-Instruct-Ori/.venv/bin/python
2
+ # -*- coding: utf-8 -*-
3
+ import re
4
+ import sys
5
+ from charset_normalizer import cli
6
+ if __name__ == '__main__':
7
+ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
8
+ sys.exit(cli.cli_detect())
.venv/bin/pip ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #!/Users/isaacdavid/Qwen2.5-3B-Instruct-Ori/.venv/bin/python
2
+ # -*- coding: utf-8 -*-
3
+ import re
4
+ import sys
5
+ from pip._internal.cli.main import main
6
+ if __name__ == '__main__':
7
+ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
8
+ sys.exit(main())
.venv/bin/pip3 ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #!/Users/isaacdavid/Qwen2.5-3B-Instruct-Ori/.venv/bin/python
2
+ # -*- coding: utf-8 -*-
3
+ import re
4
+ import sys
5
+ from pip._internal.cli.main import main
6
+ if __name__ == '__main__':
7
+ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
8
+ sys.exit(main())
.venv/bin/pip3.13 ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #!/Users/isaacdavid/Qwen2.5-3B-Instruct-Ori/.venv/bin/python
2
+ # -*- coding: utf-8 -*-
3
+ import re
4
+ import sys
5
+ from pip._internal.cli.main import main
6
+ if __name__ == '__main__':
7
+ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
8
+ sys.exit(main())
.venv/bin/python ADDED
Binary file (52.6 kB). View file
 
.venv/bin/python3 ADDED
Binary file (52.6 kB). View file
 
.venv/bin/python3.13 ADDED
Binary file (52.6 kB). View file
 
.venv/bin/tqdm ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #!/Users/isaacdavid/Qwen2.5-3B-Instruct-Ori/.venv/bin/python
2
+ # -*- coding: utf-8 -*-
3
+ import re
4
+ import sys
5
+ from tqdm.cli import main
6
+ if __name__ == '__main__':
7
+ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
8
+ sys.exit(main())
.venv/lib/python3.13/site-packages/PyYAML-6.0.2.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
.venv/lib/python3.13/site-packages/PyYAML-6.0.2.dist-info/LICENSE ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2017-2021 Ingy döt Net
2
+ Copyright (c) 2006-2016 Kirill Simonov
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ this software and associated documentation files (the "Software"), to deal in
6
+ the Software without restriction, including without limitation the rights to
7
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
8
+ of the Software, and to permit persons to whom the Software is furnished to do
9
+ so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be included in all
12
+ copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20
+ SOFTWARE.
.venv/lib/python3.13/site-packages/PyYAML-6.0.2.dist-info/METADATA ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: PyYAML
3
+ Version: 6.0.2
4
+ Summary: YAML parser and emitter for Python
5
+ Home-page: https://pyyaml.org/
6
+ Download-URL: https://pypi.org/project/PyYAML/
7
+ Author: Kirill Simonov
8
+ Author-email: [email protected]
9
+ License: MIT
10
+ Project-URL: Bug Tracker, https://github.com/yaml/pyyaml/issues
11
+ Project-URL: CI, https://github.com/yaml/pyyaml/actions
12
+ Project-URL: Documentation, https://pyyaml.org/wiki/PyYAMLDocumentation
13
+ Project-URL: Mailing lists, http://lists.sourceforge.net/lists/listinfo/yaml-core
14
+ Project-URL: Source Code, https://github.com/yaml/pyyaml
15
+ Platform: Any
16
+ Classifier: Development Status :: 5 - Production/Stable
17
+ Classifier: Intended Audience :: Developers
18
+ Classifier: License :: OSI Approved :: MIT License
19
+ Classifier: Operating System :: OS Independent
20
+ Classifier: Programming Language :: Cython
21
+ Classifier: Programming Language :: Python
22
+ Classifier: Programming Language :: Python :: 3
23
+ Classifier: Programming Language :: Python :: 3.8
24
+ Classifier: Programming Language :: Python :: 3.9
25
+ Classifier: Programming Language :: Python :: 3.10
26
+ Classifier: Programming Language :: Python :: 3.11
27
+ Classifier: Programming Language :: Python :: 3.12
28
+ Classifier: Programming Language :: Python :: 3.13
29
+ Classifier: Programming Language :: Python :: Implementation :: CPython
30
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
31
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
32
+ Classifier: Topic :: Text Processing :: Markup
33
+ Requires-Python: >=3.8
34
+ License-File: LICENSE
35
+
36
+ YAML is a data serialization format designed for human readability
37
+ and interaction with scripting languages. PyYAML is a YAML parser
38
+ and emitter for Python.
39
+
40
+ PyYAML features a complete YAML 1.1 parser, Unicode support, pickle
41
+ support, capable extension API, and sensible error messages. PyYAML
42
+ supports standard YAML tags and provides Python-specific tags that
43
+ allow to represent an arbitrary Python object.
44
+
45
+ PyYAML is applicable for a broad range of tasks from complex
46
+ configuration files to object serialization and persistence.
.venv/lib/python3.13/site-packages/PyYAML-6.0.2.dist-info/RECORD ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PyYAML-6.0.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ PyYAML-6.0.2.dist-info/LICENSE,sha256=jTko-dxEkP1jVwfLiOsmvXZBAqcoKVQwfT5RZ6V36KQ,1101
3
+ PyYAML-6.0.2.dist-info/METADATA,sha256=9-odFB5seu4pGPcEv7E8iyxNF51_uKnaNGjLAhz2lto,2060
4
+ PyYAML-6.0.2.dist-info/RECORD,,
5
+ PyYAML-6.0.2.dist-info/WHEEL,sha256=9IiDymhRAZGpezdLosJoTs0FRVFmaCNfCbrLwpjM2to,110
6
+ PyYAML-6.0.2.dist-info/top_level.txt,sha256=rpj0IVMTisAjh_1vG3Ccf9v5jpCQwAz6cD1IVU5ZdhQ,11
7
+ _yaml/__init__.py,sha256=04Ae_5osxahpJHa3XBZUAf4wi6XX32gR8D6X6p64GEA,1402
8
+ _yaml/__pycache__/__init__.cpython-313.pyc,,
9
+ yaml/__init__.py,sha256=N35S01HMesFTe0aRRMWkPj0Pa8IEbHpE9FK7cr5Bdtw,12311
10
+ yaml/__pycache__/__init__.cpython-313.pyc,,
11
+ yaml/__pycache__/composer.cpython-313.pyc,,
12
+ yaml/__pycache__/constructor.cpython-313.pyc,,
13
+ yaml/__pycache__/cyaml.cpython-313.pyc,,
14
+ yaml/__pycache__/dumper.cpython-313.pyc,,
15
+ yaml/__pycache__/emitter.cpython-313.pyc,,
16
+ yaml/__pycache__/error.cpython-313.pyc,,
17
+ yaml/__pycache__/events.cpython-313.pyc,,
18
+ yaml/__pycache__/loader.cpython-313.pyc,,
19
+ yaml/__pycache__/nodes.cpython-313.pyc,,
20
+ yaml/__pycache__/parser.cpython-313.pyc,,
21
+ yaml/__pycache__/reader.cpython-313.pyc,,
22
+ yaml/__pycache__/representer.cpython-313.pyc,,
23
+ yaml/__pycache__/resolver.cpython-313.pyc,,
24
+ yaml/__pycache__/scanner.cpython-313.pyc,,
25
+ yaml/__pycache__/serializer.cpython-313.pyc,,
26
+ yaml/__pycache__/tokens.cpython-313.pyc,,
27
+ yaml/_yaml.cpython-313-darwin.so,sha256=BQ22sZBX0PgEZn-OJGbRD1MnakXV6RUiI8XbdJW6oK4,358728
28
+ yaml/composer.py,sha256=_Ko30Wr6eDWUeUpauUGT3Lcg9QPBnOPVlTnIMRGJ9FM,4883
29
+ yaml/constructor.py,sha256=kNgkfaeLUkwQYY_Q6Ff1Tz2XVw_pG1xVE9Ak7z-viLA,28639
30
+ yaml/cyaml.py,sha256=6ZrAG9fAYvdVe2FK_w0hmXoG7ZYsoYUwapG8CiC72H0,3851
31
+ yaml/dumper.py,sha256=PLctZlYwZLp7XmeUdwRuv4nYOZ2UBnDIUy8-lKfLF-o,2837
32
+ yaml/emitter.py,sha256=jghtaU7eFwg31bG0B7RZea_29Adi9CKmXq_QjgQpCkQ,43006
33
+ yaml/error.py,sha256=Ah9z-toHJUbE9j-M8YpxgSRM5CgLCcwVzJgLLRF2Fxo,2533
34
+ yaml/events.py,sha256=50_TksgQiE4up-lKo_V-nBy-tAIxkIPQxY5qDhKCeHw,2445
35
+ yaml/loader.py,sha256=UVa-zIqmkFSCIYq_PgSGm4NSJttHY2Rf_zQ4_b1fHN0,2061
36
+ yaml/nodes.py,sha256=gPKNj8pKCdh2d4gr3gIYINnPOaOxGhJAUiYhGRnPE84,1440
37
+ yaml/parser.py,sha256=ilWp5vvgoHFGzvOZDItFoGjD6D42nhlZrZyjAwa0oJo,25495
38
+ yaml/reader.py,sha256=0dmzirOiDG4Xo41RnuQS7K9rkY3xjHiVasfDMNTqCNw,6794
39
+ yaml/representer.py,sha256=IuWP-cAW9sHKEnS0gCqSa894k1Bg4cgTxaDwIcbRQ-Y,14190
40
+ yaml/resolver.py,sha256=9L-VYfm4mWHxUD1Vg4X7rjDRK_7VZd6b92wzq7Y2IKY,9004
41
+ yaml/scanner.py,sha256=YEM3iLZSaQwXcQRg2l2R4MdT0zGP2F9eHkKGKnHyWQY,51279
42
+ yaml/serializer.py,sha256=ChuFgmhU01hj4xgI8GaKv6vfM2Bujwa9i7d2FAHj7cA,4165
43
+ yaml/tokens.py,sha256=lTQIzSVw8Mg9wv459-TjiOQe6wVziqaRlqX2_89rp54,2573
.venv/lib/python3.13/site-packages/PyYAML-6.0.2.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.44.0)
3
+ Root-Is-Purelib: false
4
+ Tag: cp313-cp313-macosx_11_0_arm64
5
+
.venv/lib/python3.13/site-packages/PyYAML-6.0.2.dist-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ _yaml
2
+ yaml
.venv/lib/python3.13/site-packages/_yaml/__init__.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is a stub package designed to roughly emulate the _yaml
2
+ # extension module, which previously existed as a standalone module
3
+ # and has been moved into the `yaml` package namespace.
4
+ # It does not perfectly mimic its old counterpart, but should get
5
+ # close enough for anyone who's relying on it even when they shouldn't.
6
+ import yaml
7
+
8
+ # in some circumstances, the yaml module we imported may be from a different version, so we need
9
+ # to tread carefully when poking at it here (it may not have the attributes we expect)
10
+ if not getattr(yaml, '__with_libyaml__', False):
11
+ from sys import version_info
12
+
13
+ exc = ModuleNotFoundError if version_info >= (3, 6) else ImportError
14
+ raise exc("No module named '_yaml'")
15
+ else:
16
+ from yaml._yaml import *
17
+ import warnings
18
+ warnings.warn(
19
+ 'The _yaml extension module is now located at yaml._yaml'
20
+ ' and its location is subject to change. To use the'
21
+ ' LibYAML-based parser and emitter, import from `yaml`:'
22
+ ' `from yaml import CLoader as Loader, CDumper as Dumper`.',
23
+ DeprecationWarning
24
+ )
25
+ del warnings
26
+ # Don't `del yaml` here because yaml is actually an existing
27
+ # namespace member of _yaml.
28
+
29
+ __name__ = '_yaml'
30
+ # If the module is top-level (i.e. not a part of any specific package)
31
+ # then the attribute should be set to ''.
32
+ # https://docs.python.org/3.8/library/types.html
33
+ __package__ = ''
.venv/lib/python3.13/site-packages/certifi-2025.1.31.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
.venv/lib/python3.13/site-packages/certifi-2025.1.31.dist-info/LICENSE ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ This package contains a modified version of ca-bundle.crt:
2
+
3
+ ca-bundle.crt -- Bundle of CA Root Certificates
4
+
5
+ This is a bundle of X.509 certificates of public Certificate Authorities
6
+ (CA). These were automatically extracted from Mozilla's root certificates
7
+ file (certdata.txt). This file can be found in the mozilla source tree:
8
+ https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
9
+ It contains the certificates in PEM format and therefore
10
+ can be directly used with curl / libcurl / php_curl, or with
11
+ an Apache+mod_ssl webserver for SSL client authentication.
12
+ Just configure this file as the SSLCACertificateFile.#
13
+
14
+ ***** BEGIN LICENSE BLOCK *****
15
+ This Source Code Form is subject to the terms of the Mozilla Public License,
16
+ v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
17
+ one at http://mozilla.org/MPL/2.0/.
18
+
19
+ ***** END LICENSE BLOCK *****
20
+ @(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $
.venv/lib/python3.13/site-packages/certifi-2025.1.31.dist-info/METADATA ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.2
2
+ Name: certifi
3
+ Version: 2025.1.31
4
+ Summary: Python package for providing Mozilla's CA Bundle.
5
+ Home-page: https://github.com/certifi/python-certifi
6
+ Author: Kenneth Reitz
7
+ Author-email: [email protected]
8
+ License: MPL-2.0
9
+ Project-URL: Source, https://github.com/certifi/python-certifi
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
13
+ Classifier: Natural Language :: English
14
+ Classifier: Programming Language :: Python
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3 :: Only
17
+ Classifier: Programming Language :: Python :: 3.6
18
+ Classifier: Programming Language :: Python :: 3.7
19
+ Classifier: Programming Language :: Python :: 3.8
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Programming Language :: Python :: 3.13
25
+ Requires-Python: >=3.6
26
+ License-File: LICENSE
27
+ Dynamic: author
28
+ Dynamic: author-email
29
+ Dynamic: classifier
30
+ Dynamic: description
31
+ Dynamic: home-page
32
+ Dynamic: license
33
+ Dynamic: project-url
34
+ Dynamic: requires-python
35
+ Dynamic: summary
36
+
37
+ Certifi: Python SSL Certificates
38
+ ================================
39
+
40
+ Certifi provides Mozilla's carefully curated collection of Root Certificates for
41
+ validating the trustworthiness of SSL certificates while verifying the identity
42
+ of TLS hosts. It has been extracted from the `Requests`_ project.
43
+
44
+ Installation
45
+ ------------
46
+
47
+ ``certifi`` is available on PyPI. Simply install it with ``pip``::
48
+
49
+ $ pip install certifi
50
+
51
+ Usage
52
+ -----
53
+
54
+ To reference the installed certificate authority (CA) bundle, you can use the
55
+ built-in function::
56
+
57
+ >>> import certifi
58
+
59
+ >>> certifi.where()
60
+ '/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
61
+
62
+ Or from the command line::
63
+
64
+ $ python -m certifi
65
+ /usr/local/lib/python3.7/site-packages/certifi/cacert.pem
66
+
67
+ Enjoy!
68
+
69
+ .. _`Requests`: https://requests.readthedocs.io/en/master/
70
+
71
+ Addition/Removal of Certificates
72
+ --------------------------------
73
+
74
+ Certifi does not support any addition/removal or other modification of the
75
+ CA trust store content. This project is intended to provide a reliable and
76
+ highly portable root of trust to python deployments. Look to upstream projects
77
+ for methods to use alternate trust.
.venv/lib/python3.13/site-packages/certifi-2025.1.31.dist-info/RECORD ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ certifi-2025.1.31.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ certifi-2025.1.31.dist-info/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
3
+ certifi-2025.1.31.dist-info/METADATA,sha256=t5kcT5aGu0dQ6_psUNZYTqnC0uCRnponewm3uYjeHbg,2451
4
+ certifi-2025.1.31.dist-info/RECORD,,
5
+ certifi-2025.1.31.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
6
+ certifi-2025.1.31.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
7
+ certifi/__init__.py,sha256=neIaAf7BM36ygmQCmy-ZsSyjnvjWghFeu13wwEAnjj0,94
8
+ certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
9
+ certifi/__pycache__/__init__.cpython-313.pyc,,
10
+ certifi/__pycache__/__main__.cpython-313.pyc,,
11
+ certifi/__pycache__/core.cpython-313.pyc,,
12
+ certifi/cacert.pem,sha256=xVsh-Qf3-G1IrdCTVS-1ZRdJ_1-GBQjMu0I9bB-9gMc,297255
13
+ certifi/core.py,sha256=qRDDFyXVJwTB_EmoGppaXU_R9qCZvhl-EzxPMuV3nTA,4426
14
+ certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
.venv/lib/python3.13/site-packages/certifi-2025.1.31.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.8.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
.venv/lib/python3.13/site-packages/certifi-2025.1.31.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ certifi
.venv/lib/python3.13/site-packages/certifi/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .core import contents, where
2
+
3
+ __all__ = ["contents", "where"]
4
+ __version__ = "2025.01.31"
.venv/lib/python3.13/site-packages/certifi/__main__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ from certifi import contents, where
4
+
5
+ parser = argparse.ArgumentParser()
6
+ parser.add_argument("-c", "--contents", action="store_true")
7
+ args = parser.parse_args()
8
+
9
+ if args.contents:
10
+ print(contents())
11
+ else:
12
+ print(where())
.venv/lib/python3.13/site-packages/certifi/cacert.pem ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.13/site-packages/certifi/core.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ certifi.py
3
+ ~~~~~~~~~~
4
+
5
+ This module returns the installation location of cacert.pem or its contents.
6
+ """
7
+ import sys
8
+ import atexit
9
+
10
+ def exit_cacert_ctx() -> None:
11
+ _CACERT_CTX.__exit__(None, None, None) # type: ignore[union-attr]
12
+
13
+
14
+ if sys.version_info >= (3, 11):
15
+
16
+ from importlib.resources import as_file, files
17
+
18
+ _CACERT_CTX = None
19
+ _CACERT_PATH = None
20
+
21
+ def where() -> str:
22
+ # This is slightly terrible, but we want to delay extracting the file
23
+ # in cases where we're inside of a zipimport situation until someone
24
+ # actually calls where(), but we don't want to re-extract the file
25
+ # on every call of where(), so we'll do it once then store it in a
26
+ # global variable.
27
+ global _CACERT_CTX
28
+ global _CACERT_PATH
29
+ if _CACERT_PATH is None:
30
+ # This is slightly janky, the importlib.resources API wants you to
31
+ # manage the cleanup of this file, so it doesn't actually return a
32
+ # path, it returns a context manager that will give you the path
33
+ # when you enter it and will do any cleanup when you leave it. In
34
+ # the common case of not needing a temporary file, it will just
35
+ # return the file system location and the __exit__() is a no-op.
36
+ #
37
+ # We also have to hold onto the actual context manager, because
38
+ # it will do the cleanup whenever it gets garbage collected, so
39
+ # we will also store that at the global level as well.
40
+ _CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
41
+ _CACERT_PATH = str(_CACERT_CTX.__enter__())
42
+ atexit.register(exit_cacert_ctx)
43
+
44
+ return _CACERT_PATH
45
+
46
+ def contents() -> str:
47
+ return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
48
+
49
+ elif sys.version_info >= (3, 7):
50
+
51
+ from importlib.resources import path as get_path, read_text
52
+
53
+ _CACERT_CTX = None
54
+ _CACERT_PATH = None
55
+
56
+ def where() -> str:
57
+ # This is slightly terrible, but we want to delay extracting the
58
+ # file in cases where we're inside of a zipimport situation until
59
+ # someone actually calls where(), but we don't want to re-extract
60
+ # the file on every call of where(), so we'll do it once then store
61
+ # it in a global variable.
62
+ global _CACERT_CTX
63
+ global _CACERT_PATH
64
+ if _CACERT_PATH is None:
65
+ # This is slightly janky, the importlib.resources API wants you
66
+ # to manage the cleanup of this file, so it doesn't actually
67
+ # return a path, it returns a context manager that will give
68
+ # you the path when you enter it and will do any cleanup when
69
+ # you leave it. In the common case of not needing a temporary
70
+ # file, it will just return the file system location and the
71
+ # __exit__() is a no-op.
72
+ #
73
+ # We also have to hold onto the actual context manager, because
74
+ # it will do the cleanup whenever it gets garbage collected, so
75
+ # we will also store that at the global level as well.
76
+ _CACERT_CTX = get_path("certifi", "cacert.pem")
77
+ _CACERT_PATH = str(_CACERT_CTX.__enter__())
78
+ atexit.register(exit_cacert_ctx)
79
+
80
+ return _CACERT_PATH
81
+
82
+ def contents() -> str:
83
+ return read_text("certifi", "cacert.pem", encoding="ascii")
84
+
85
+ else:
86
+ import os
87
+ import types
88
+ from typing import Union
89
+
90
+ Package = Union[types.ModuleType, str]
91
+ Resource = Union[str, "os.PathLike"]
92
+
93
+ # This fallback will work for Python versions prior to 3.7 that lack the
94
+ # importlib.resources module but relies on the existing `where` function
95
+ # so won't address issues with environments like PyOxidizer that don't set
96
+ # __file__ on modules.
97
+ def read_text(
98
+ package: Package,
99
+ resource: Resource,
100
+ encoding: str = 'utf-8',
101
+ errors: str = 'strict'
102
+ ) -> str:
103
+ with open(where(), encoding=encoding) as data:
104
+ return data.read()
105
+
106
+ # If we don't have importlib.resources, then we will just do the old logic
107
+ # of assuming we're on the filesystem and munge the path directly.
108
+ def where() -> str:
109
+ f = os.path.dirname(__file__)
110
+
111
+ return os.path.join(f, "cacert.pem")
112
+
113
+ def contents() -> str:
114
+ return read_text("certifi", "cacert.pem", encoding="ascii")
.venv/lib/python3.13/site-packages/certifi/py.typed ADDED
File without changes
.venv/lib/python3.13/site-packages/charset_normalizer-3.4.1.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
.venv/lib/python3.13/site-packages/charset_normalizer-3.4.1.dist-info/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 TAHRI Ahmed R.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
.venv/lib/python3.13/site-packages/charset_normalizer-3.4.1.dist-info/METADATA ADDED
@@ -0,0 +1,721 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: charset-normalizer
3
+ Version: 3.4.1
4
+ Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
5
+ Author-email: "Ahmed R. TAHRI" <[email protected]>
6
+ Maintainer-email: "Ahmed R. TAHRI" <[email protected]>
7
+ License: MIT
8
+ Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
9
+ Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
10
+ Project-URL: Code, https://github.com/jawah/charset_normalizer
11
+ Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
12
+ Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.7
20
+ Classifier: Programming Language :: Python :: 3.8
21
+ Classifier: Programming Language :: Python :: 3.9
22
+ Classifier: Programming Language :: Python :: 3.10
23
+ Classifier: Programming Language :: Python :: 3.11
24
+ Classifier: Programming Language :: Python :: 3.12
25
+ Classifier: Programming Language :: Python :: 3.13
26
+ Classifier: Programming Language :: Python :: 3 :: Only
27
+ Classifier: Programming Language :: Python :: Implementation :: CPython
28
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
29
+ Classifier: Topic :: Text Processing :: Linguistic
30
+ Classifier: Topic :: Utilities
31
+ Classifier: Typing :: Typed
32
+ Requires-Python: >=3.7
33
+ Description-Content-Type: text/markdown
34
+ License-File: LICENSE
35
+ Provides-Extra: unicode-backport
36
+
37
+ <h1 align="center">Charset Detection, for Everyone 👋</h1>
38
+
39
+ <p align="center">
40
+ <sup>The Real First Universal Charset Detector</sup><br>
41
+ <a href="https://pypi.org/project/charset-normalizer">
42
+ <img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
43
+ </a>
44
+ <a href="https://pepy.tech/project/charset-normalizer/">
45
+ <img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
46
+ </a>
47
+ <a href="https://bestpractices.coreinfrastructure.org/projects/7297">
48
+ <img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
49
+ </a>
50
+ </p>
51
+ <p align="center">
52
+ <sup><i>Featured Packages</i></sup><br>
53
+ <a href="https://github.com/jawah/niquests">
54
+ <img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Best_HTTP_Client-cyan">
55
+ </a>
56
+ <a href="https://github.com/jawah/wassima">
57
+ <img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
58
+ </a>
59
+ </p>
60
+ <p align="center">
61
+ <sup><i>In other language (unofficial port - by the community)</i></sup><br>
62
+ <a href="https://github.com/nickspring/charset-normalizer-rs">
63
+ <img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
64
+ </a>
65
+ </p>
66
+
67
+ > A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
68
+ > I'm trying to resolve the issue by taking a new approach.
69
+ > All IANA character set names for which the Python core library provides codecs are supported.
70
+
71
+ <p align="center">
72
+ >>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
73
+ </p>
74
+
75
+ This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
76
+
77
+ | Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
78
+ |--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
79
+ | `Fast` | ❌ | ✅ | ✅ |
80
+ | `Universal**` | ❌ | ✅ | ❌ |
81
+ | `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
82
+ | `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
83
+ | `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
84
+ | `Native Python` | ✅ | ✅ | ❌ |
85
+ | `Detect spoken language` | ❌ | ✅ | N/A |
86
+ | `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
87
+ | `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
88
+ | `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
89
+
90
+ <p align="center">
91
+ <img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
92
+ </p>
93
+
94
+ *\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
95
+
96
+ ## ⚡ Performance
97
+
98
+ This package offers better performance than its counterpart Chardet. Here are some numbers.
99
+
100
+ | Package | Accuracy | Mean per file (ms) | File per sec (est) |
101
+ |-----------------------------------------------|:--------:|:------------------:|:------------------:|
102
+ | [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
103
+ | charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
104
+
105
+ | Package | 99th percentile | 95th percentile | 50th percentile |
106
+ |-----------------------------------------------|:---------------:|:---------------:|:---------------:|
107
+ | [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
108
+ | charset-normalizer | 100 ms | 50 ms | 5 ms |
109
+
110
+ _updated as of december 2024 using CPython 3.12_
111
+
112
+ Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
113
+
114
+ > Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
115
+ > And yes, these results might change at any time. The dataset can be updated to include more files.
116
+ > The actual delays heavily depend on your CPU capabilities. The factors should remain the same.
117
+ > Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
118
+ > (e.g. Supported Encoding) Challenge-them if you want.
119
+
120
+ ## ✨ Installation
121
+
122
+ Using pip:
123
+
124
+ ```sh
125
+ pip install charset-normalizer -U
126
+ ```
127
+
128
+ ## 🚀 Basic Usage
129
+
130
+ ### CLI
131
+ This package comes with a CLI.
132
+
133
+ ```
134
+ usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
135
+ file [file ...]
136
+
137
+ The Real First Universal Charset Detector. Discover originating encoding used
138
+ on text file. Normalize text to unicode.
139
+
140
+ positional arguments:
141
+ files File(s) to be analysed
142
+
143
+ optional arguments:
144
+ -h, --help show this help message and exit
145
+ -v, --verbose Display complementary information about file if any.
146
+ Stdout will contain logs about the detection process.
147
+ -a, --with-alternative
148
+ Output complementary possibilities if any. Top-level
149
+ JSON WILL be a list.
150
+ -n, --normalize Permit to normalize input file. If not set, program
151
+ does not write anything.
152
+ -m, --minimal Only output the charset detected to STDOUT. Disabling
153
+ JSON output.
154
+ -r, --replace Replace file when trying to normalize it instead of
155
+ creating a new one.
156
+ -f, --force Replace file without asking if you are sure, use this
157
+ flag with caution.
158
+ -t THRESHOLD, --threshold THRESHOLD
159
+ Define a custom maximum amount of chaos allowed in
160
+ decoded content. 0. <= chaos <= 1.
161
+ --version Show version information and exit.
162
+ ```
163
+
164
+ ```bash
165
+ normalizer ./data/sample.1.fr.srt
166
+ ```
167
+
168
+ or
169
+
170
+ ```bash
171
+ python -m charset_normalizer ./data/sample.1.fr.srt
172
+ ```
173
+
174
+ 🎉 Since version 1.4.0 the CLI produces an easily usable stdout result in JSON format.
175
+
176
+ ```json
177
+ {
178
+ "path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
179
+ "encoding": "cp1252",
180
+ "encoding_aliases": [
181
+ "1252",
182
+ "windows_1252"
183
+ ],
184
+ "alternative_encodings": [
185
+ "cp1254",
186
+ "cp1256",
187
+ "cp1258",
188
+ "iso8859_14",
189
+ "iso8859_15",
190
+ "iso8859_16",
191
+ "iso8859_3",
192
+ "iso8859_9",
193
+ "latin_1",
194
+ "mbcs"
195
+ ],
196
+ "language": "French",
197
+ "alphabets": [
198
+ "Basic Latin",
199
+ "Latin-1 Supplement"
200
+ ],
201
+ "has_sig_or_bom": false,
202
+ "chaos": 0.149,
203
+ "coherence": 97.152,
204
+ "unicode_path": null,
205
+ "is_preferred": true
206
+ }
207
+ ```
208
+
209
+ ### Python
210
+ *Just print out normalized text*
211
+ ```python
212
+ from charset_normalizer import from_path
213
+
214
+ results = from_path('./my_subtitle.srt')
215
+
216
+ print(str(results.best()))
217
+ ```
218
+
219
+ *Upgrade your code without effort*
220
+ ```python
221
+ from charset_normalizer import detect
222
+ ```
223
+
224
+ The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
225
+
226
+ See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
227
+
228
+ ## 😇 Why
229
+
230
+ When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
231
+ reliable alternative using a completely different method. Also! I never back down on a good challenge!
232
+
233
+ I **don't care** about the **originating charset** encoding, because **two different tables** can
234
+ produce **two identical rendered string.**
235
+ What I want is to get readable text, the best I can.
236
+
237
+ In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
238
+
239
+ Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer's goal is to convert raw files in an unknown encoding to Unicode.
240
+
241
+ ## 🍰 How
242
+
243
+ - Discard all charset encoding table that could not fit the binary content.
244
+ - Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
245
+ - Extract matches with the lowest mess detected.
246
+ - Additionally, we measure coherence / probe for a language.
247
+
248
+ **Wait a minute**, what is noise/mess and coherence according to **YOU ?**
249
+
250
+ *Noise :* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
251
+ **I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
252
+ I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
253
+ improve or rewrite it.
254
+
255
+ *Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
256
+ that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
257
+
258
+ ## ⚡ Known limitations
259
+
260
+ - Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
261
+ - Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
262
+
263
+ ## ⚠️ About Python EOLs
264
+
265
+ **If you are running:**
266
+
267
+ - Python >=2.7,<3.5: Unsupported
268
+ - Python 3.5: charset-normalizer < 2.1
269
+ - Python 3.6: charset-normalizer < 3.1
270
+ - Python 3.7: charset-normalizer < 4.0
271
+
272
+ Upgrade your Python interpreter as soon as possible.
273
+
274
+ ## 👤 Contributing
275
+
276
+ Contributions, issues and feature requests are very much welcome.<br />
277
+ Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
278
+
279
+ ## 📝 License
280
+
281
+ Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
282
+ This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
283
+
284
+ Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
285
+
286
+ ## 💼 For Enterprise
287
+
288
+ Professional support for charset-normalizer is available as part of the [Tidelift
289
+ Subscription][1]. Tidelift gives software development teams a single source for
290
+ purchasing and maintaining their software, with professional grade assurances
291
+ from the experts who know it best, while seamlessly integrating with existing
292
+ tools.
293
+
294
+ [1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
295
+
296
+ [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/7297/badge)](https://www.bestpractices.dev/projects/7297)
297
+
298
+ # Changelog
299
+ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
300
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
301
+
302
+ ## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
303
+
304
+ ### Changed
305
+ - Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend.
306
+ - Enforce annotation delayed loading for a simpler and consistent types in the project.
307
+ - Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
308
+
309
+ ### Added
310
+ - pre-commit configuration.
311
+ - noxfile.
312
+
313
+ ### Removed
314
+ - `build-requirements.txt` as per using `pyproject.toml` native build configuration.
315
+ - `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
316
+ - `setup.cfg` in favor of `pyproject.toml` metadata configuration.
317
+ - Unused `utils.range_scan` function.
318
+
319
+ ### Fixed
320
+ - Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
321
+ - Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
322
+
323
+ ## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
324
+
325
+ ### Added
326
+ - Argument `--no-preemptive` in the CLI to prevent the detector to search for hints.
327
+ - Support for Python 3.13 (#512)
328
+
329
+ ### Fixed
330
+ - Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
331
+ - Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537)
332
+ - Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
333
+
334
+ ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
335
+
336
+ ### Fixed
337
+ - Unintentional memory usage regression when using large payload that match several encoding (#376)
338
+ - Regression on some detection case showcased in the documentation (#371)
339
+
340
+ ### Added
341
+ - Noise (md) probe that identify malformed arabic representation due to the presence of letters in isolated form (credit to my wife)
342
+
343
+ ## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
344
+
345
+ ### Changed
346
+ - Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
347
+ - Improved the general detection reliability based on reports from the community
348
+
349
+ ## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
350
+
351
+ ### Added
352
+ - Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
353
+ - Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
354
+
355
+ ### Removed
356
+ - (internal) Redundant utils.is_ascii function and unused function is_private_use_only
357
+ - (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
358
+
359
+ ### Changed
360
+ - (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
361
+ - Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
362
+
363
+ ### Fixed
364
+ - Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
365
+
366
+ ## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
367
+
368
+ ### Changed
369
+ - Typehint for function `from_path` no longer enforce `PathLike` as its first argument
370
+ - Minor improvement over the global detection reliability
371
+
372
+ ### Added
373
+ - Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
374
+ - Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
375
+ - Explicit support for Python 3.12
376
+
377
+ ### Fixed
378
+ - Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
379
+
380
+ ## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
381
+
382
+ ### Added
383
+ - Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
384
+
385
+ ### Removed
386
+ - Support for Python 3.6 (PR #260)
387
+
388
+ ### Changed
389
+ - Optional speedup provided by mypy/c 1.0.1
390
+
391
+ ## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
392
+
393
+ ### Fixed
394
+ - Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
395
+
396
+ ### Changed
397
+ - Speedup provided by mypy/c 0.990 on Python >= 3.7
398
+
399
+ ## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
400
+
401
+ ### Added
402
+ - Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
403
+ - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
404
+ - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
405
+ - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
406
+
407
+ ### Changed
408
+ - Build with static metadata using 'build' frontend
409
+ - Make the language detection stricter
410
+ - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
411
+
412
+ ### Fixed
413
+ - CLI with opt --normalize fail when using full path for files
414
+ - TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
415
+ - Sphinx warnings when generating the documentation
416
+
417
+ ### Removed
418
+ - Coherence detector no longer return 'Simple English' instead return 'English'
419
+ - Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
420
+ - Breaking: Method `first()` and `best()` from CharsetMatch
421
+ - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
422
+ - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
423
+ - Breaking: Top-level function `normalize`
424
+ - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
425
+ - Support for the backport `unicodedata2`
426
+
427
+ ## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
428
+
429
+ ### Added
430
+ - Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
431
+ - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
432
+ - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
433
+
434
+ ### Changed
435
+ - Build with static metadata using 'build' frontend
436
+ - Make the language detection stricter
437
+
438
+ ### Fixed
439
+ - CLI with opt --normalize fail when using full path for files
440
+ - TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
441
+
442
+ ### Removed
443
+ - Coherence detector no longer return 'Simple English' instead return 'English'
444
+ - Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
445
+
446
+ ## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
447
+
448
+ ### Added
449
+ - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
450
+
451
+ ### Removed
452
+ - Breaking: Method `first()` and `best()` from CharsetMatch
453
+ - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
454
+
455
+ ### Fixed
456
+ - Sphinx warnings when generating the documentation
457
+
458
+ ## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
459
+
460
+ ### Changed
461
+ - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
462
+
463
+ ### Removed
464
+ - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
465
+ - Breaking: Top-level function `normalize`
466
+ - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
467
+ - Support for the backport `unicodedata2`
468
+
469
+ ## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
470
+
471
+ ### Deprecated
472
+ - Function `normalize` scheduled for removal in 3.0
473
+
474
+ ### Changed
475
+ - Removed useless call to decode in fn is_unprintable (#206)
476
+
477
+ ### Fixed
478
+ - Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
479
+
480
+ ## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
481
+
482
+ ### Added
483
+ - Output the Unicode table version when running the CLI with `--version` (PR #194)
484
+
485
+ ### Changed
486
+ - Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
487
+ - Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
488
+
489
+ ### Fixed
490
+ - Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
491
+ - CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
492
+
493
+ ### Removed
494
+ - Support for Python 3.5 (PR #192)
495
+
496
+ ### Deprecated
497
+ - Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
498
+
499
+ ## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
500
+
501
+ ### Fixed
502
+ - ASCII misdetection in rare cases (PR #170)
503
+
504
+ ## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
505
+
506
+ ### Added
507
+ - Explicit support for Python 3.11 (PR #164)
508
+
509
+ ### Changed
510
+ - The logging behavior has been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
511
+
512
+ ## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
513
+
514
+ ### Fixed
515
+ - Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
516
+
517
+ ### Changed
518
+ - Skipping the language-detection (CD) on ASCII (PR #155)
519
+
520
+ ## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
521
+
522
+ ### Changed
523
+ - Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
524
+
525
+ ### Fixed
526
+ - Wrong logging level applied when setting kwarg `explain` to True (PR #146)
527
+
528
+ ## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
529
+ ### Changed
530
+ - Improvement over Vietnamese detection (PR #126)
531
+ - MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
532
+ - Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
533
+ - call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
534
+ - Code style as refactored by Sourcery-AI (PR #131)
535
+ - Minor adjustment on the MD around european words (PR #133)
536
+ - Remove and replace SRTs from assets / tests (PR #139)
537
+ - Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
538
+ - Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
539
+
540
+ ### Fixed
541
+ - Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
542
+ - Avoid using too insignificant chunk (PR #137)
543
+
544
+ ### Added
545
+ - Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
546
+ - Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
547
+
548
+ ## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
549
+ ### Added
550
+ - Add support for Kazakh (Cyrillic) language detection (PR #109)
551
+
552
+ ### Changed
553
+ - Further, improve inferring the language from a given single-byte code page (PR #112)
554
+ - Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
555
+ - Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
556
+ - Various detection improvement (MD+CD) (PR #117)
557
+
558
+ ### Removed
559
+ - Remove redundant logging entry about detected language(s) (PR #115)
560
+
561
+ ### Fixed
562
+ - Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
563
+
564
+ ## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
565
+ ### Fixed
566
+ - Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
567
+ - Fix CLI crash when using --minimal output in certain cases (PR #103)
568
+
569
+ ### Changed
570
+ - Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
571
+
572
+ ## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
573
+ ### Changed
574
+ - The project now complies with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
575
+ - The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
576
+ - The Unicode detection is slightly improved (PR #93)
577
+ - Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
578
+
579
+ ### Removed
580
+ - The project no longer raises a warning on tiny content given for detection; it is simply logged as a warning instead (PR #92)
581
+
582
+ ### Fixed
583
+ - In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
584
+ - Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
585
+ - The MANIFEST.in was not exhaustive (PR #78)
586
+
587
+ ## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
588
+ ### Fixed
589
+ - The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
590
+ - Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
591
+ - The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
592
+ - Submatch factoring could be wrong in rare edge cases (PR #72)
593
+ - Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
594
+ - Fix line endings from CRLF to LF for certain project files (PR #67)
595
+
596
+ ### Changed
597
+ - Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
598
+ - Allow fallback on specified encoding if any (PR #71)
599
+
600
+ ## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
601
+ ### Changed
602
+ - Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
603
+ - According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
604
+
605
+ ## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
606
+ ### Fixed
607
+ - Empty/Too small JSON payload misdetection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
608
+
609
+ ### Changed
610
+ - Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
611
+
612
+ ## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
613
+ ### Fixed
614
+ - Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
615
+ - Using explain=False permanently disable the verbose output in the current runtime (PR #47)
616
+ - One log entry (language target preemptive) was not shown in logs when using explain=True (PR #47)
617
+ - Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
618
+
619
+ ### Changed
620
+ - Public function normalize default args values were not aligned with from_bytes (PR #53)
621
+
622
+ ### Added
623
+ - You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
624
+
625
+ ## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
626
+ ### Changed
627
+ - 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
628
+ - Accent has been made on UTF-8 detection, should perform rather instantaneous.
629
+ - The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
630
+ - The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
631
+ - The program has been rewritten to ease the readability and maintainability. (+Using static typing)
632
+ - utf_7 detection has been reinstated.
633
+
634
+ ### Removed
635
+ - This package no longer requires anything when used with Python 3.5 (Dropped cached_property)
636
+ - Removed support for these languages: Catalan, Esperanto, Kazakh, Basque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
637
+ - The exception hook on UnicodeDecodeError has been removed.
638
+
639
+ ### Deprecated
640
+ - Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
641
+
642
+ ### Fixed
643
+ - The CLI output used the relative path of the file(s). Should be absolute.
644
+
645
+ ## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
646
+ ### Fixed
647
+ - Logger configuration/usage no longer conflict with others (PR #44)
648
+
649
+ ## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
650
+ ### Removed
651
+ - Using standard logging instead of using the package loguru.
652
+ - Dropping nose test framework in favor of the maintained pytest.
653
+ - Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
654
+ - Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
655
+ - Stop support for UTF-7 that does not contain a SIG.
656
+ - Dropping PrettyTable, replaced with pure JSON output in CLI.
657
+
658
+ ### Fixed
659
+ - BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
660
+ - Not searching properly for the BOM when trying utf32/16 parent codec.
661
+
662
+ ### Changed
663
+ - Improving the package final size by compressing frequencies.json.
664
+ - Huge improvement over the largest payloads.
665
+
666
+ ### Added
667
+ - CLI now produces JSON consumable output.
668
+ - Return ASCII if given sequences fit. Given reasonable confidence.
669
+
670
+ ## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
671
+
672
+ ### Fixed
673
+ - In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
674
+
675
+ ## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
676
+
677
+ ### Fixed
678
+ - Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
679
+
680
+ ## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
681
+
682
+ ### Fixed
683
+ - The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
684
+
685
+ ## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
686
+
687
+ ### Changed
688
+ - Amend the previous release to allow prettytable 2.0 (PR #35)
689
+
690
+ ## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
691
+
692
+ ### Fixed
693
+ - Fix error while using the package with a python pre-release interpreter (PR #33)
694
+
695
+ ### Changed
696
+ - Dependencies refactoring, constraints revised.
697
+
698
+ ### Added
699
+ - Add python 3.9 and 3.10 to the supported interpreters
700
+
701
+ MIT License
702
+
703
+ Copyright (c) 2025 TAHRI Ahmed R.
704
+
705
+ Permission is hereby granted, free of charge, to any person obtaining a copy
706
+ of this software and associated documentation files (the "Software"), to deal
707
+ in the Software without restriction, including without limitation the rights
708
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
709
+ copies of the Software, and to permit persons to whom the Software is
710
+ furnished to do so, subject to the following conditions:
711
+
712
+ The above copyright notice and this permission notice shall be included in all
713
+ copies or substantial portions of the Software.
714
+
715
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
716
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
717
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
718
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
719
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
720
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
721
+ SOFTWARE.
.venv/lib/python3.13/site-packages/charset_normalizer-3.4.1.dist-info/RECORD ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ../../../bin/normalizer,sha256=EEF2YGZQp06ajmmHxz5vp4FlVLcMI9diMu84EJnUtis,268
2
+ charset_normalizer-3.4.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
3
+ charset_normalizer-3.4.1.dist-info/LICENSE,sha256=bQ1Bv-FwrGx9wkjJpj4lTQ-0WmDVCoJX0K-SxuJJuIc,1071
4
+ charset_normalizer-3.4.1.dist-info/METADATA,sha256=JbyHzhmqZh_ugEn1Y7TY7CDYZA9FoU6BP25hrCNDf50,35313
5
+ charset_normalizer-3.4.1.dist-info/RECORD,,
6
+ charset_normalizer-3.4.1.dist-info/WHEEL,sha256=8V5JjwATQfL0d9dd03DBSgMiHY8GVsHwXVusby9L_SY,115
7
+ charset_normalizer-3.4.1.dist-info/entry_points.txt,sha256=8C-Y3iXIfyXQ83Tpir2B8t-XLJYpxF5xbb38d_js-h4,65
8
+ charset_normalizer-3.4.1.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
9
+ charset_normalizer/__init__.py,sha256=OKRxRv2Zhnqk00tqkN0c1BtJjm165fWXLydE52IKuHc,1590
10
+ charset_normalizer/__main__.py,sha256=yzYxMR-IhKRHYwcSlavEv8oGdwxsR89mr2X09qXGdps,109
11
+ charset_normalizer/__pycache__/__init__.cpython-313.pyc,,
12
+ charset_normalizer/__pycache__/__main__.cpython-313.pyc,,
13
+ charset_normalizer/__pycache__/api.cpython-313.pyc,,
14
+ charset_normalizer/__pycache__/cd.cpython-313.pyc,,
15
+ charset_normalizer/__pycache__/constant.cpython-313.pyc,,
16
+ charset_normalizer/__pycache__/legacy.cpython-313.pyc,,
17
+ charset_normalizer/__pycache__/md.cpython-313.pyc,,
18
+ charset_normalizer/__pycache__/models.cpython-313.pyc,,
19
+ charset_normalizer/__pycache__/utils.cpython-313.pyc,,
20
+ charset_normalizer/__pycache__/version.cpython-313.pyc,,
21
+ charset_normalizer/api.py,sha256=qBRz8mJ_R5E713R6TOyqHEdnmyxbEDnCSHvx32ubDGg,22617
22
+ charset_normalizer/cd.py,sha256=WKTo1HDb-H9HfCDc3Bfwq5jzS25Ziy9SE2a74SgTq88,12522
23
+ charset_normalizer/cli/__init__.py,sha256=D8I86lFk2-py45JvqxniTirSj_sFyE6sjaY_0-G1shc,136
24
+ charset_normalizer/cli/__main__.py,sha256=VGC9klOoi6_R2z8rmyrc936kv7u2A1udjjHtlmNPDTM,10410
25
+ charset_normalizer/cli/__pycache__/__init__.cpython-313.pyc,,
26
+ charset_normalizer/cli/__pycache__/__main__.cpython-313.pyc,,
27
+ charset_normalizer/constant.py,sha256=4VuTcZNLew1j_8ixA-Rt_VVqNWD4pwgHOHMCMlr0964,40477
28
+ charset_normalizer/legacy.py,sha256=yhNXsPHkBfqPXKRb-sPXNj3Bscp9-mFGcYOkJ62tg9c,2328
29
+ charset_normalizer/md.cpython-313-darwin.so,sha256=mUvSZMr6ty3_yLX68YZ5BJQr91RHfsfhiJCWQCFBMnE,115664
30
+ charset_normalizer/md.py,sha256=iyXXQGWl54nnLQLueMWTmUtlivO0-rTBgVkmJxIIAGU,20036
31
+ charset_normalizer/md__mypyc.cpython-313-darwin.so,sha256=yf7dzVSgLFK-NZBo4MXVDlEpN4VoXWXFuLSbUTd0XFw,482184
32
+ charset_normalizer/models.py,sha256=lKXhOnIPtiakbK3i__J9wpOfzx3JDTKj7Dn3Rg0VaRI,12394
33
+ charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
+ charset_normalizer/utils.py,sha256=T5UHo8AS7NVMmgruWoZyqEf0WrZVcQpgUNetRoborSk,12002
35
+ charset_normalizer/version.py,sha256=Ambcj3O8FfvdLfDLc8dkaxZx97O1IM_R4_aKGD_TDdE,115
.venv/lib/python3.13/site-packages/charset_normalizer-3.4.1.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.6.0)
3
+ Root-Is-Purelib: false
4
+ Tag: cp313-cp313-macosx_10_13_universal2
5
+
.venv/lib/python3.13/site-packages/charset_normalizer-3.4.1.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [console_scripts]
2
+ normalizer = charset_normalizer:cli.cli_detect
.venv/lib/python3.13/site-packages/charset_normalizer-3.4.1.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ charset_normalizer
.venv/lib/python3.13/site-packages/charset_normalizer/__init__.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Charset-Normalizer
3
+ ~~~~~~~~~~~~~~
4
+ The Real First Universal Charset Detector.
5
+ A library that helps you read text from an unknown charset encoding.
6
+ Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
7
+ All IANA character set names for which the Python core library provides codecs are supported.
8
+
9
+ Basic usage:
10
+ >>> from charset_normalizer import from_bytes
11
+ >>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
12
+ >>> best_guess = results.best()
13
+ >>> str(best_guess)
14
+ 'Bсеки човек има право на образование. Oбразованието!'
15
+
16
+ Others methods and usages are available - see the full documentation
17
+ at <https://github.com/Ousret/charset_normalizer>.
18
+ :copyright: (c) 2021 by Ahmed TAHRI
19
+ :license: MIT, see LICENSE for more details.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import logging
25
+
26
+ from .api import from_bytes, from_fp, from_path, is_binary
27
+ from .legacy import detect
28
+ from .models import CharsetMatch, CharsetMatches
29
+ from .utils import set_logging_handler
30
+ from .version import VERSION, __version__
31
+
32
+ __all__ = (
33
+ "from_fp",
34
+ "from_path",
35
+ "from_bytes",
36
+ "is_binary",
37
+ "detect",
38
+ "CharsetMatch",
39
+ "CharsetMatches",
40
+ "__version__",
41
+ "VERSION",
42
+ "set_logging_handler",
43
+ )
44
+
45
+ # Attach a NullHandler to the top level logger by default
46
+ # https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
47
+
48
+ logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
.venv/lib/python3.13/site-packages/charset_normalizer/__main__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from .cli import cli_detect
4
+
5
+ if __name__ == "__main__":
6
+ cli_detect()
.venv/lib/python3.13/site-packages/charset_normalizer/api.py ADDED
@@ -0,0 +1,668 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from os import PathLike
5
+ from typing import BinaryIO
6
+
7
+ from .cd import (
8
+ coherence_ratio,
9
+ encoding_languages,
10
+ mb_encoding_languages,
11
+ merge_coherence_ratios,
12
+ )
13
+ from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
14
+ from .md import mess_ratio
15
+ from .models import CharsetMatch, CharsetMatches
16
+ from .utils import (
17
+ any_specified_encoding,
18
+ cut_sequence_chunks,
19
+ iana_name,
20
+ identify_sig_or_bom,
21
+ is_cp_similar,
22
+ is_multi_byte_encoding,
23
+ should_strip_sig_or_bom,
24
+ )
25
+
26
+ logger = logging.getLogger("charset_normalizer")
27
+ explain_handler = logging.StreamHandler()
28
+ explain_handler.setFormatter(
29
+ logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
30
+ )
31
+
32
+
33
+ def from_bytes(
34
+ sequences: bytes | bytearray,
35
+ steps: int = 5,
36
+ chunk_size: int = 512,
37
+ threshold: float = 0.2,
38
+ cp_isolation: list[str] | None = None,
39
+ cp_exclusion: list[str] | None = None,
40
+ preemptive_behaviour: bool = True,
41
+ explain: bool = False,
42
+ language_threshold: float = 0.1,
43
+ enable_fallback: bool = True,
44
+ ) -> CharsetMatches:
45
+ """
46
+ Given a raw bytes sequence, return the best possibles charset usable to render str objects.
47
+ If there is no results, it is a strong indicator that the source is binary/not text.
48
+ By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
49
+ And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
50
+
51
+ The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
52
+ but never take it for granted. Can improve the performance.
53
+
54
+ You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
55
+ purpose.
56
+
57
+ This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
58
+ By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
59
+ toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
60
+ Custom logging format and handler can be set manually.
61
+ """
62
+
63
+ if not isinstance(sequences, (bytearray, bytes)):
64
+ raise TypeError(
65
+ "Expected object of type bytes or bytearray, got: {}".format(
66
+ type(sequences)
67
+ )
68
+ )
69
+
70
+ if explain:
71
+ previous_logger_level: int = logger.level
72
+ logger.addHandler(explain_handler)
73
+ logger.setLevel(TRACE)
74
+
75
+ length: int = len(sequences)
76
+
77
+ if length == 0:
78
+ logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
79
+ if explain: # Defensive: ensure exit path clean handler
80
+ logger.removeHandler(explain_handler)
81
+ logger.setLevel(previous_logger_level or logging.WARNING)
82
+ return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
83
+
84
+ if cp_isolation is not None:
85
+ logger.log(
86
+ TRACE,
87
+ "cp_isolation is set. use this flag for debugging purpose. "
88
+ "limited list of encoding allowed : %s.",
89
+ ", ".join(cp_isolation),
90
+ )
91
+ cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
92
+ else:
93
+ cp_isolation = []
94
+
95
+ if cp_exclusion is not None:
96
+ logger.log(
97
+ TRACE,
98
+ "cp_exclusion is set. use this flag for debugging purpose. "
99
+ "limited list of encoding excluded : %s.",
100
+ ", ".join(cp_exclusion),
101
+ )
102
+ cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
103
+ else:
104
+ cp_exclusion = []
105
+
106
+ if length <= (chunk_size * steps):
107
+ logger.log(
108
+ TRACE,
109
+ "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
110
+ steps,
111
+ chunk_size,
112
+ length,
113
+ )
114
+ steps = 1
115
+ chunk_size = length
116
+
117
+ if steps > 1 and length / steps < chunk_size:
118
+ chunk_size = int(length / steps)
119
+
120
+ is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
121
+ is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
122
+
123
+ if is_too_small_sequence:
124
+ logger.log(
125
+ TRACE,
126
+ "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
127
+ length
128
+ ),
129
+ )
130
+ elif is_too_large_sequence:
131
+ logger.log(
132
+ TRACE,
133
+ "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
134
+ length
135
+ ),
136
+ )
137
+
138
+ prioritized_encodings: list[str] = []
139
+
140
+ specified_encoding: str | None = (
141
+ any_specified_encoding(sequences) if preemptive_behaviour else None
142
+ )
143
+
144
+ if specified_encoding is not None:
145
+ prioritized_encodings.append(specified_encoding)
146
+ logger.log(
147
+ TRACE,
148
+ "Detected declarative mark in sequence. Priority +1 given for %s.",
149
+ specified_encoding,
150
+ )
151
+
152
+ tested: set[str] = set()
153
+ tested_but_hard_failure: list[str] = []
154
+ tested_but_soft_failure: list[str] = []
155
+
156
+ fallback_ascii: CharsetMatch | None = None
157
+ fallback_u8: CharsetMatch | None = None
158
+ fallback_specified: CharsetMatch | None = None
159
+
160
+ results: CharsetMatches = CharsetMatches()
161
+
162
+ early_stop_results: CharsetMatches = CharsetMatches()
163
+
164
+ sig_encoding, sig_payload = identify_sig_or_bom(sequences)
165
+
166
+ if sig_encoding is not None:
167
+ prioritized_encodings.append(sig_encoding)
168
+ logger.log(
169
+ TRACE,
170
+ "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
171
+ len(sig_payload),
172
+ sig_encoding,
173
+ )
174
+
175
+ prioritized_encodings.append("ascii")
176
+
177
+ if "utf_8" not in prioritized_encodings:
178
+ prioritized_encodings.append("utf_8")
179
+
180
+ for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
181
+ if cp_isolation and encoding_iana not in cp_isolation:
182
+ continue
183
+
184
+ if cp_exclusion and encoding_iana in cp_exclusion:
185
+ continue
186
+
187
+ if encoding_iana in tested:
188
+ continue
189
+
190
+ tested.add(encoding_iana)
191
+
192
+ decoded_payload: str | None = None
193
+ bom_or_sig_available: bool = sig_encoding == encoding_iana
194
+ strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
195
+ encoding_iana
196
+ )
197
+
198
+ if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
199
+ logger.log(
200
+ TRACE,
201
+ "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
202
+ encoding_iana,
203
+ )
204
+ continue
205
+ if encoding_iana in {"utf_7"} and not bom_or_sig_available:
206
+ logger.log(
207
+ TRACE,
208
+ "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
209
+ encoding_iana,
210
+ )
211
+ continue
212
+
213
+ try:
214
+ is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
215
+ except (ModuleNotFoundError, ImportError):
216
+ logger.log(
217
+ TRACE,
218
+ "Encoding %s does not provide an IncrementalDecoder",
219
+ encoding_iana,
220
+ )
221
+ continue
222
+
223
+ try:
224
+ if is_too_large_sequence and is_multi_byte_decoder is False:
225
+ str(
226
+ (
227
+ sequences[: int(50e4)]
228
+ if strip_sig_or_bom is False
229
+ else sequences[len(sig_payload) : int(50e4)]
230
+ ),
231
+ encoding=encoding_iana,
232
+ )
233
+ else:
234
+ decoded_payload = str(
235
+ (
236
+ sequences
237
+ if strip_sig_or_bom is False
238
+ else sequences[len(sig_payload) :]
239
+ ),
240
+ encoding=encoding_iana,
241
+ )
242
+ except (UnicodeDecodeError, LookupError) as e:
243
+ if not isinstance(e, LookupError):
244
+ logger.log(
245
+ TRACE,
246
+ "Code page %s does not fit given bytes sequence at ALL. %s",
247
+ encoding_iana,
248
+ str(e),
249
+ )
250
+ tested_but_hard_failure.append(encoding_iana)
251
+ continue
252
+
253
+ similar_soft_failure_test: bool = False
254
+
255
+ for encoding_soft_failed in tested_but_soft_failure:
256
+ if is_cp_similar(encoding_iana, encoding_soft_failed):
257
+ similar_soft_failure_test = True
258
+ break
259
+
260
+ if similar_soft_failure_test:
261
+ logger.log(
262
+ TRACE,
263
+ "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
264
+ encoding_iana,
265
+ encoding_soft_failed,
266
+ )
267
+ continue
268
+
269
+ r_ = range(
270
+ 0 if not bom_or_sig_available else len(sig_payload),
271
+ length,
272
+ int(length / steps),
273
+ )
274
+
275
+ multi_byte_bonus: bool = (
276
+ is_multi_byte_decoder
277
+ and decoded_payload is not None
278
+ and len(decoded_payload) < length
279
+ )
280
+
281
+ if multi_byte_bonus:
282
+ logger.log(
283
+ TRACE,
284
+ "Code page %s is a multi byte encoding table and it appear that at least one character "
285
+ "was encoded using n-bytes.",
286
+ encoding_iana,
287
+ )
288
+
289
+ max_chunk_gave_up: int = int(len(r_) / 4)
290
+
291
+ max_chunk_gave_up = max(max_chunk_gave_up, 2)
292
+ early_stop_count: int = 0
293
+ lazy_str_hard_failure = False
294
+
295
+ md_chunks: list[str] = []
296
+ md_ratios = []
297
+
298
+ try:
299
+ for chunk in cut_sequence_chunks(
300
+ sequences,
301
+ encoding_iana,
302
+ r_,
303
+ chunk_size,
304
+ bom_or_sig_available,
305
+ strip_sig_or_bom,
306
+ sig_payload,
307
+ is_multi_byte_decoder,
308
+ decoded_payload,
309
+ ):
310
+ md_chunks.append(chunk)
311
+
312
+ md_ratios.append(
313
+ mess_ratio(
314
+ chunk,
315
+ threshold,
316
+ explain is True and 1 <= len(cp_isolation) <= 2,
317
+ )
318
+ )
319
+
320
+ if md_ratios[-1] >= threshold:
321
+ early_stop_count += 1
322
+
323
+ if (early_stop_count >= max_chunk_gave_up) or (
324
+ bom_or_sig_available and strip_sig_or_bom is False
325
+ ):
326
+ break
327
+ except (
328
+ UnicodeDecodeError
329
+ ) as e: # Lazy str loading may have missed something there
330
+ logger.log(
331
+ TRACE,
332
+ "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
333
+ encoding_iana,
334
+ str(e),
335
+ )
336
+ early_stop_count = max_chunk_gave_up
337
+ lazy_str_hard_failure = True
338
+
339
+ # We might want to check the sequence again with the whole content
340
+ # Only if initial MD tests passes
341
+ if (
342
+ not lazy_str_hard_failure
343
+ and is_too_large_sequence
344
+ and not is_multi_byte_decoder
345
+ ):
346
+ try:
347
+ sequences[int(50e3) :].decode(encoding_iana, errors="strict")
348
+ except UnicodeDecodeError as e:
349
+ logger.log(
350
+ TRACE,
351
+ "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
352
+ encoding_iana,
353
+ str(e),
354
+ )
355
+ tested_but_hard_failure.append(encoding_iana)
356
+ continue
357
+
358
+ mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
359
+ if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
360
+ tested_but_soft_failure.append(encoding_iana)
361
+ logger.log(
362
+ TRACE,
363
+ "%s was excluded because of initial chaos probing. Gave up %i time(s). "
364
+ "Computed mean chaos is %f %%.",
365
+ encoding_iana,
366
+ early_stop_count,
367
+ round(mean_mess_ratio * 100, ndigits=3),
368
+ )
369
+ # Preparing those fallbacks in case we got nothing.
370
+ if (
371
+ enable_fallback
372
+ and encoding_iana in ["ascii", "utf_8", specified_encoding]
373
+ and not lazy_str_hard_failure
374
+ ):
375
+ fallback_entry = CharsetMatch(
376
+ sequences,
377
+ encoding_iana,
378
+ threshold,
379
+ False,
380
+ [],
381
+ decoded_payload,
382
+ preemptive_declaration=specified_encoding,
383
+ )
384
+ if encoding_iana == specified_encoding:
385
+ fallback_specified = fallback_entry
386
+ elif encoding_iana == "ascii":
387
+ fallback_ascii = fallback_entry
388
+ else:
389
+ fallback_u8 = fallback_entry
390
+ continue
391
+
392
+ logger.log(
393
+ TRACE,
394
+ "%s passed initial chaos probing. Mean measured chaos is %f %%",
395
+ encoding_iana,
396
+ round(mean_mess_ratio * 100, ndigits=3),
397
+ )
398
+
399
+ if not is_multi_byte_decoder:
400
+ target_languages: list[str] = encoding_languages(encoding_iana)
401
+ else:
402
+ target_languages = mb_encoding_languages(encoding_iana)
403
+
404
+ if target_languages:
405
+ logger.log(
406
+ TRACE,
407
+ "{} should target any language(s) of {}".format(
408
+ encoding_iana, str(target_languages)
409
+ ),
410
+ )
411
+
412
+ cd_ratios = []
413
+
414
+ # We shall skip the CD when its about ASCII
415
+ # Most of the time its not relevant to run "language-detection" on it.
416
+ if encoding_iana != "ascii":
417
+ for chunk in md_chunks:
418
+ chunk_languages = coherence_ratio(
419
+ chunk,
420
+ language_threshold,
421
+ ",".join(target_languages) if target_languages else None,
422
+ )
423
+
424
+ cd_ratios.append(chunk_languages)
425
+
426
+ cd_ratios_merged = merge_coherence_ratios(cd_ratios)
427
+
428
+ if cd_ratios_merged:
429
+ logger.log(
430
+ TRACE,
431
+ "We detected language {} using {}".format(
432
+ cd_ratios_merged, encoding_iana
433
+ ),
434
+ )
435
+
436
+ current_match = CharsetMatch(
437
+ sequences,
438
+ encoding_iana,
439
+ mean_mess_ratio,
440
+ bom_or_sig_available,
441
+ cd_ratios_merged,
442
+ (
443
+ decoded_payload
444
+ if (
445
+ is_too_large_sequence is False
446
+ or encoding_iana in [specified_encoding, "ascii", "utf_8"]
447
+ )
448
+ else None
449
+ ),
450
+ preemptive_declaration=specified_encoding,
451
+ )
452
+
453
+ results.append(current_match)
454
+
455
+ if (
456
+ encoding_iana in [specified_encoding, "ascii", "utf_8"]
457
+ and mean_mess_ratio < 0.1
458
+ ):
459
+ # If md says nothing to worry about, then... stop immediately!
460
+ if mean_mess_ratio == 0.0:
461
+ logger.debug(
462
+ "Encoding detection: %s is most likely the one.",
463
+ current_match.encoding,
464
+ )
465
+ if explain: # Defensive: ensure exit path clean handler
466
+ logger.removeHandler(explain_handler)
467
+ logger.setLevel(previous_logger_level)
468
+ return CharsetMatches([current_match])
469
+
470
+ early_stop_results.append(current_match)
471
+
472
+ if (
473
+ len(early_stop_results)
474
+ and (specified_encoding is None or specified_encoding in tested)
475
+ and "ascii" in tested
476
+ and "utf_8" in tested
477
+ ):
478
+ probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
479
+ logger.debug(
480
+ "Encoding detection: %s is most likely the one.",
481
+ probable_result.encoding,
482
+ )
483
+ if explain: # Defensive: ensure exit path clean handler
484
+ logger.removeHandler(explain_handler)
485
+ logger.setLevel(previous_logger_level)
486
+
487
+ return CharsetMatches([probable_result])
488
+
489
+ if encoding_iana == sig_encoding:
490
+ logger.debug(
491
+ "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
492
+ "the beginning of the sequence.",
493
+ encoding_iana,
494
+ )
495
+ if explain: # Defensive: ensure exit path clean handler
496
+ logger.removeHandler(explain_handler)
497
+ logger.setLevel(previous_logger_level)
498
+ return CharsetMatches([results[encoding_iana]])
499
+
500
+ if len(results) == 0:
501
+ if fallback_u8 or fallback_ascii or fallback_specified:
502
+ logger.log(
503
+ TRACE,
504
+ "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
505
+ )
506
+
507
+ if fallback_specified:
508
+ logger.debug(
509
+ "Encoding detection: %s will be used as a fallback match",
510
+ fallback_specified.encoding,
511
+ )
512
+ results.append(fallback_specified)
513
+ elif (
514
+ (fallback_u8 and fallback_ascii is None)
515
+ or (
516
+ fallback_u8
517
+ and fallback_ascii
518
+ and fallback_u8.fingerprint != fallback_ascii.fingerprint
519
+ )
520
+ or (fallback_u8 is not None)
521
+ ):
522
+ logger.debug("Encoding detection: utf_8 will be used as a fallback match")
523
+ results.append(fallback_u8)
524
+ elif fallback_ascii:
525
+ logger.debug("Encoding detection: ascii will be used as a fallback match")
526
+ results.append(fallback_ascii)
527
+
528
+ if results:
529
+ logger.debug(
530
+ "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
531
+ results.best().encoding, # type: ignore
532
+ len(results) - 1,
533
+ )
534
+ else:
535
+ logger.debug("Encoding detection: Unable to determine any suitable charset.")
536
+
537
+ if explain:
538
+ logger.removeHandler(explain_handler)
539
+ logger.setLevel(previous_logger_level)
540
+
541
+ return results
542
+
543
+
544
def from_fp(
    fp: BinaryIO,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Run the detection on the content of an already opened binary file object.

    Everything ``fp.read()`` returns is handed to ``from_bytes`` together with
    the tuning options, forwarded unchanged and in the same positional order.
    The file object is not closed here; that stays the caller's job.
    """
    # Kept positional on purpose: this mirrors the argument order from_bytes expects.
    detection_options = (
        steps,
        chunk_size,
        threshold,
        cp_isolation,
        cp_exclusion,
        preemptive_behaviour,
        explain,
        language_threshold,
        enable_fallback,
    )

    return from_bytes(fp.read(), *detection_options)
572
+
573
+
574
def from_path(
    path: str | bytes | PathLike,  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Open ``path`` in binary mode and run the detection on its content.

    This is ``from_fp`` with one extra step: the file is opened here and closed
    again when the detection returns. Errors raised by ``open`` (IOError/OSError)
    are not caught and propagate to the caller.
    """
    # Forwarded positionally, in the order from_fp declares them.
    detection_options = (
        steps,
        chunk_size,
        threshold,
        cp_isolation,
        cp_exclusion,
        preemptive_behaviour,
        explain,
        language_threshold,
        enable_fallback,
    )

    with open(path, "rb") as binary_stream:
        return from_fp(binary_stream, *detection_options)
603
+
604
+
605
def is_binary(
    fp_or_path_or_payload: PathLike | str | BinaryIO | bytes,  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: list[str] | None = None,
    cp_exclusion: list[str] | None = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = False,
) -> bool:
    """
    Tell whether the given input (path, raw bytes or binary file object) looks binary, i.e. not text.

    The input is routed to ``from_path`` (``str`` / ``PathLike``), ``from_bytes``
    (``bytes`` / ``bytearray``) or ``from_fp`` (anything else) with the same options.
    The result is True when that detection yields no match at all.
    Fallback matches are disabled by default here (``enable_fallback=False``) so that
    ASCII-compatible content that is unlikely to be a string is not reported as text.
    """
    detection_options = {
        "steps": steps,
        "chunk_size": chunk_size,
        "threshold": threshold,
        "cp_isolation": cp_isolation,
        "cp_exclusion": cp_exclusion,
        "preemptive_behaviour": preemptive_behaviour,
        "explain": explain,
        "language_threshold": language_threshold,
        "enable_fallback": enable_fallback,
    }

    # Pick the entry point matching the kind of input we were given.
    if isinstance(fp_or_path_or_payload, (str, PathLike)):
        detector = from_path
    elif isinstance(fp_or_path_or_payload, (bytes, bytearray)):
        detector = from_bytes
    else:
        detector = from_fp

    guesses = detector(fp_or_path_or_payload, **detection_options)

    return not guesses
.venv/lib/python3.13/site-packages/charset_normalizer/cd.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ from codecs import IncrementalDecoder
5
+ from collections import Counter
6
+ from functools import lru_cache
7
+ from typing import Counter as TypeCounter
8
+
9
+ from .constant import (
10
+ FREQUENCIES,
11
+ KO_NAMES,
12
+ LANGUAGE_SUPPORTED_COUNT,
13
+ TOO_SMALL_SEQUENCE,
14
+ ZH_NAMES,
15
+ )
16
+ from .md import is_suspiciously_successive_range
17
+ from .models import CoherenceMatches
18
+ from .utils import (
19
+ is_accentuated,
20
+ is_latin,
21
+ is_multi_byte_encoding,
22
+ is_unicode_range_secondary,
23
+ unicode_range,
24
+ )
25
+
26
+
27
def encoding_unicode_range(iana_name: str) -> list[str]:
    """
    Return the unicode ranges a single byte code page mostly maps to.

    Each byte from 0x40 up to 0xFE is decoded on its own with the code page's
    incremental decoder (errors ignored). Every decoded character whose unicode
    range is known and not flagged as secondary is tallied. The sorted names of
    the ranges holding at least 15% of that tally are returned.

    Raises OSError when ``iana_name`` is a multi-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise OSError("Function not supported on multi-byte code page")

    codec_module = importlib.import_module(f"encodings.{iana_name}")
    incremental: IncrementalDecoder = codec_module.IncrementalDecoder(errors="ignore")

    range_hits: TypeCounter[str] = Counter()

    for byte_value in range(0x40, 0xFF):
        decoded: str = incremental.decode(bytes([byte_value]))

        if not decoded:
            continue

        detected_range: str | None = unicode_range(decoded)

        if detected_range is None:
            continue

        if is_unicode_range_secondary(detected_range) is not False:
            continue

        range_hits[detected_range] += 1

    # Total number of characters that landed in a primary range.
    total_hits: int = sum(range_hits.values())

    return sorted(
        range_name
        for range_name, hits in range_hits.items()
        if hits / total_hits >= 0.15
    )
62
+
63
+
64
def unicode_range_languages(primary_range: str) -> list[str]:
    """
    Return every language from FREQUENCIES having at least one character in the given unicode range.

    Languages come back in the iteration order of FREQUENCIES.
    """
    return [
        language
        for language, language_characters in FREQUENCIES.items()
        if any(
            unicode_range(candidate) == primary_range
            for candidate in language_characters
        )
    ]
77
+
78
+
79
@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
    """
    Associate a single-byte code page with the language(s) it is most likely used for.

    The first unicode range of the code page whose name does not contain "Latin"
    decides the languages. When every range is Latin (or there is none), the
    generic ["Latin Based"] marker is returned instead.
    """
    non_latin_range: str | None = next(
        (
            candidate_range
            for candidate_range in encoding_unicode_range(iana_name)
            if "Latin" not in candidate_range
        ),
        None,
    )

    if non_latin_range is None:
        return ["Latin Based"]

    return unicode_range_languages(non_latin_range)
97
+
98
+
99
@lru_cache()
def mb_encoding_languages(iana_name: str) -> list[str]:
    """
    Associate a multi-byte code page with the language it is tied to.

    The decision is made on the code page name only: Japanese, Chinese and Korean
    families are recognised; anything else yields an empty list.
    """
    japanese_prefixes = ("shift_", "iso2022_jp", "euc_j")

    if iana_name.startswith(japanese_prefixes) or iana_name == "cp932":
        return ["Japanese"]

    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]

    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
118
+
119
+
120
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
    """
    Describe a supported language with two flags: (uses accents, is pure Latin).

    The first flag is True as soon as one of its FREQUENCIES characters is accentuated.
    The second stays True only while no character is reported as non-Latin.
    """
    language_characters = FREQUENCIES[language]

    uses_accents: bool = any(
        is_accentuated(character) for character in language_characters
    )
    only_latin: bool = all(
        is_latin(character) is not False for character in language_characters
    )

    return uses_accents, only_latin
135
+
136
+
137
def alphabet_languages(
    characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
    """
    Return the languages compatible with the given characters, best coverage first.

    A language is kept when at least 20% of its FREQUENCIES characters appear in
    ``characters``. Languages are skipped when:
      - ``ignore_non_latin`` is set and the language is not pure Latin;
      - the source contains accentuated characters but the language has none.

    :param characters: characters observed in the text being analysed.
    :param ignore_non_latin: restrict the search to pure Latin languages.
    :return: language names sorted by decreasing coverage ratio.
    """
    # Built once: membership is tested for every character of every language,
    # so a hash lookup avoids rescanning the source list each time.
    source_characters = frozenset(characters)
    source_have_accents = any(is_accentuated(character) for character in characters)

    scored_languages: list[tuple[str, float]] = []

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_match_count: int = sum(
            1 for character in language_characters if character in source_characters
        )
        ratio: float = character_match_count / len(language_characters)

        if ratio >= 0.2:
            scored_languages.append((language, ratio))

    # Stable sort: languages with equal coverage keep their FREQUENCIES order.
    scored_languages.sort(key=lambda scored: scored[1], reverse=True)

    return [language for language, _ in scored_languages]
170
+
171
+
172
def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Tell how well a list of characters, ordered from most to least frequent, fits a language.

    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    The comparison is deliberately lenient to ease the detection: a close match counts as a match.

    :param language: a key of FREQUENCIES.
    :param ordered_characters: observed characters, most frequent first.
    :return: share of ``ordered_characters`` approved for that language; 0.0 for an empty list.
    :raises ValueError: when ``language`` is not a key of FREQUENCIES.
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available")

    ordered_characters_count: int = len(ordered_characters)

    # Nothing to compare: report "no correspondence" instead of dividing by zero below.
    if ordered_characters_count == 0:
        return 0.0

    language_characters = FREQUENCIES[language]
    target_language_characters_count: int = len(language_characters)

    # Rank of each character in the language table (first occurrence wins, like list.index).
    rank_in_language: dict[str, int] = {}
    for rank, language_character in enumerate(language_characters):
        rank_in_language.setdefault(language_character, rank)

    large_alphabet: bool = target_language_characters_count > 26

    # Scale factor projecting a rank in the observed list onto the language table.
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    character_approved_count: int = 0

    for character_rank, character in enumerate(ordered_characters):
        character_rank_in_language = rank_in_language.get(character)

        if character_rank_in_language is None:
            continue

        character_rank_projection: int = int(character_rank * expected_projection_ratio)
        rank_gap: int = abs(character_rank_projection - character_rank_in_language)

        # Small alphabets: a character more than 4 ranks away is rejected outright.
        if large_alphabet is False and rank_gap > 4:
            continue

        # Large alphabets: being within a third of the table is good enough.
        if large_alphabet is True and rank_gap < target_language_characters_count / 3:
            character_approved_count += 1
            continue

        # Otherwise compare what sits before/after the character on both sides.
        characters_before_source: list[str] = language_characters[
            0:character_rank_in_language
        ]
        characters_after_source: list[str] = language_characters[
            character_rank_in_language:
        ]
        characters_before: list[str] = ordered_characters[0:character_rank]
        characters_after: list[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )
        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1

    return character_approved_count / ordered_characters_count
250
+
251
+
252
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Split a decoded text into layers, one per alphabet / unicode range.

    Only alphabetic characters with a known unicode range are kept, lower-cased.
    A character joins the first existing layer whose range is not suspiciously
    successive with its own; otherwise it goes to the layer of its own range.
    Ex. English text with a bit of Hebrew gives two items: the latin letters
    and the hebrew ones.
    """
    layer_parts: dict[str, list[str]] = {}

    for symbol in decoded_sequence:
        if symbol.isalpha() is False:
            continue

        symbol_range: str | None = unicode_range(symbol)

        if symbol_range is None:
            continue

        # First compatible layer already discovered, else the character's own range.
        target_layer: str = next(
            (
                known_range
                for known_range in layer_parts
                if is_suspiciously_successive_range(known_range, symbol_range)
                is False
            ),
            symbol_range,
        )

        layer_parts.setdefault(target_layer, []).append(symbol.lower())

    return ["".join(parts) for parts in layer_parts.values()]
289
+
290
+
291
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge several results of coherence_ratio into a single one of the same type.

    Each language gets the mean of all the ratios reported for it, rounded to
    4 digits. The merged list is sorted by decreasing ratio.
    """
    ratios_by_language: dict[str, list[float]] = {}

    for chunk_matches in results:
        for language, ratio in chunk_matches:
            ratios_by_language.setdefault(language, []).append(ratio)

    averaged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in ratios_by_language.items()
    ]
    averaged.sort(key=lambda entry: entry[1], reverse=True)

    return averaged
317
+
318
+
319
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    Collapse alternative language entries onto their base language.

    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". The em-dash is stripped from every language name and, when a
    language ends up listed several times, only its best ratio is kept.

    The stripping is applied unconditionally, so an alternative whose base
    language is absent from ``results`` is still reported under the base name.

    :param results: (language, ratio) pairs, possibly using alternative names.
    :return: a new list of (base language, best ratio), ordered by first appearance.
    """
    best_ratio_by_language: dict[str, float] = {}

    for language, ratio in results:
        base_name: str = language.replace("—", "")
        known_ratio = best_ratio_by_language.get(base_name)

        # Updating an existing key keeps its original position in the dict.
        if known_ratio is None or ratio > known_ratio:
            best_ratio_by_language[base_name] = ratio

    return list(best_ratio_by_language.items())
344
+
345
+
346
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence, analysed layer by layer.
    A layer = characters extracted per alphabet / unicode range.

    ``lg_inclusion`` is a comma separated list of languages to test. The special
    entry "Latin Based" is removed from that list and instead restricts the
    automatic language lookup to pure Latin languages. Layers with no more than
    TOO_SMALL_SEQUENCE characters are ignored, as are languages scoring below
    ``threshold``. The result is sorted by decreasing ratio.
    """
    matches: list[tuple[str, float]] = []
    strong_match_count: int = 0

    requested_languages = [] if lg_inclusion is None else lg_inclusion.split(",")
    latin_only: bool = "Latin Based" in requested_languages

    if latin_only:
        requested_languages.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        ranked_characters = Counter(layer).most_common()

        layer_size: int = sum(occurrences for _, occurrences in ranked_characters)

        if layer_size <= TOO_SMALL_SEQUENCE:
            continue

        characters_by_popularity: list[str] = [
            character for character, _ in ranked_characters
        ]

        # Explicitly requested languages win over the automatic lookup.
        candidate_languages = requested_languages or alphabet_languages(
            characters_by_popularity, latin_only
        )

        for language in candidate_languages:
            ratio: float = characters_popularity_compare(
                language, characters_by_popularity
            )

            if ratio < threshold:
                continue

            if ratio >= 0.8:
                strong_match_count += 1

            matches.append((language, round(ratio, 4)))

            # Three very good matches are enough for this layer.
            if strong_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(matches),
        key=lambda match: match[1],
        reverse=True,
    )
.venv/lib/python3.13/site-packages/charset_normalizer/cli/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from .__main__ import cli_detect, query_yes_no
4
+
5
+ __all__ = (
6
+ "cli_detect",
7
+ "query_yes_no",
8
+ )
.venv/lib/python3.13/site-packages/charset_normalizer/cli/__main__.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import sys
5
+ from json import dumps
6
+ from os.path import abspath, basename, dirname, join, realpath
7
+ from platform import python_version
8
+ from unicodedata import unidata_version
9
+
10
+ import charset_normalizer.md as md_module
11
+ from charset_normalizer import from_fp
12
+ from charset_normalizer.models import CliDetectionResult
13
+ from charset_normalizer.version import __version__
14
+
15
+
16
def query_yes_no(question: str, default: str | None = "yes") -> bool:
    """Ask a yes/no question via input() and return their answer.

    "question" is a string that is presented to the user.
    "default" is the presumed answer if the user just hits <Enter>.
            It must be "yes" (the default), "no" or None (meaning
            an answer is required of the user).

    The "answer" return value is True for "yes" or False for "no".
    The question is asked again until a valid answer is given.
    Raises ValueError when "default" is none of the accepted values.

    Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    """
    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}

    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while True:
        sys.stdout.write(question + prompt)
        choice = input().lower()

        # An empty answer selects the default, when there is one.
        if default is not None and choice == "":
            return valid[default]

        if choice in valid:
            return valid[choice]

        sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
47
+
48
+
49
def cli_detect(argv: list[str] | None = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser.

    Runs the detection on every given file and prints the findings to STDOUT:
    a JSON document by default, or only the encoding names with --minimal.
    With --normalize, the output of the best match is also written to disk,
    either next to the original file or over it when --replace is set.

    :param argv: Arguments to parse. ``None`` lets argparse read ``sys.argv``.
    :return: 0 if everything is fine, 1 when the options are used incorrectly,
        2 when the normalized file could not be written.
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help="Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help="Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help="Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help="Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-i",
        "--no-preemptive",
        action="store_true",
        default=False,
        dest="no_preemptive",
        help="Disable looking at a charset declaration to hint the detector.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.2,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
            __version__,
            python_version(),
            unidata_version,
            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
        ),
        help="Show version information and exit.",
    )

    args = parser.parse_args(argv)

    # Option sanity checks, evaluated in the same order as before: the first
    # failing rule decides which message is shown.
    usage_error = None
    if args.replace is True and args.normalize is False:
        usage_error = "Use --replace in addition of --normalize only."
    elif args.force is True and args.replace is False:
        usage_error = "Use --force in addition of --replace only."
    elif not 0.0 <= args.threshold <= 1.0:
        # Negated chained comparison so that NaN is rejected as well.
        usage_error = "--threshold VALUE should be between 0. AND 1."

    if usage_error is not None:
        # argparse.FileType opened every input while parsing; release them.
        _close_files(args.files)
        print(usage_error, file=sys.stderr)
        return 1

    results = []

    for my_file in args.files:
        try:
            matches = from_fp(
                my_file,
                threshold=args.threshold,
                explain=args.verbose,
                preemptive_behaviour=args.no_preemptive is False,
            )

            best_guess = matches.best()

            if best_guess is None:
                print(
                    'Unable to identify originating encoding for "{}". {}'.format(
                        my_file.name,
                        (
                            "Maybe try increasing maximum amount of chaos."
                            if args.threshold < 1.0
                            else ""
                        ),
                    ),
                    file=sys.stderr,
                )
                # Placeholder entry so the file still shows up in the report.
                results.append(
                    CliDetectionResult(
                        abspath(my_file.name),
                        None,
                        [],
                        [],
                        "Unknown",
                        [],
                        False,
                        1.0,
                        0.0,
                        None,
                        True,
                    )
                )
                continue

            # Keep a handle on THIS file's preferred result: the normalized
            # path must be recorded on it, not on the first entry of the report.
            primary = _cli_result_from_match(my_file.name, best_guess, True)
            results.append(primary)

            if len(matches) > 1 and args.alternatives:
                results.extend(
                    _cli_result_from_match(my_file.name, el, False)
                    for el in matches
                    if el != best_guess
                )

            if args.normalize is False:
                continue

            if best_guess.encoding.startswith("utf") is True:
                print(
                    '"{}" file does not need to be normalized, as it already came from unicode.'.format(
                        my_file.name
                    ),
                    file=sys.stderr,
                )
                continue

            dir_path = dirname(realpath(my_file.name))
            file_name = basename(realpath(my_file.name))

            o_: list[str] = file_name.split(".")

            if args.replace is False:
                # Insert the encoding before the last dot-separated component
                # ("a.txt" -> "a.<encoding>.txt"). A name without any dot
                # becomes "<encoding>.<name>".
                o_.insert(-1, best_guess.encoding)
            elif (
                args.force is False
                and query_yes_no(
                    'Are you sure to normalize "{}" by replacing it ?'.format(
                        my_file.name
                    ),
                    "no",
                )
                is False
            ):
                # The user declined to overwrite the file.
                continue

            try:
                primary.unicode_path = join(dir_path, ".".join(o_))

                with open(primary.unicode_path, "wb") as fp:
                    fp.write(best_guess.output())
            except OSError as e:
                print(str(e), file=sys.stderr)
                # Aborting: also release the inputs that were not processed yet.
                _close_files(args.files)
                return 2
        finally:
            # Runs on every exit path of the iteration (continue, return, error).
            if my_file.closed is False:
                my_file.close()

    if args.minimal is False:
        print(
            dumps(
                [el.__dict__ for el in results]
                if len(results) > 1
                else results[0].__dict__,
                ensure_ascii=True,
                indent=4,
            )
        )
    else:
        for my_file in args.files:
            print(
                ", ".join(
                    [
                        el.encoding or "undefined"
                        for el in results
                        if el.path == abspath(my_file.name)
                    ]
                )
            )

    return 0


def _close_files(files) -> None:
    """Close every file object of *files* that is still open."""
    for handle in files:
        if handle.closed is False:
            handle.close()


def _cli_result_from_match(path: str, match, is_preferred: bool):
    """Build the CliDetectionResult describing *match* for the file at *path*."""
    return CliDetectionResult(
        abspath(path),
        match.encoding,
        match.encoding_aliases,
        [cp for cp in match.could_be_from_charset if cp != match.encoding],
        match.language,
        match.alphabets,
        match.bom,
        match.percent_chaos,
        match.percent_coherence,
        None,  # unicode_path, filled in later when the file gets normalized
        is_preferred,
    )
318
+
319
+
320
if __name__ == "__main__":
    # Propagate the status returned by cli_detect() as the process exit status,
    # so that failures are visible to the calling shell.
    raise SystemExit(cli_detect())
.venv/lib/python3.13/site-packages/charset_normalizer/constant.py ADDED
@@ -0,0 +1,1998 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
4
+ from encodings.aliases import aliases
5
+ from re import IGNORECASE
6
+ from re import compile as re_compile
7
+
8
# Maps each eligible encoding name to its byte signature (SIG/BOM).
# A value is either a single byte string or a list of accepted alternatives.
ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
    "utf_8": BOM_UTF8,
    "utf_7": [
        b"\x2b\x2f\x76\x38",  # ASCII "+/v8"
        b"\x2b\x2f\x76\x39",  # ASCII "+/v9"
        b"\x2b\x2f\x76\x2b",  # ASCII "+/v+"
        b"\x2b\x2f\x76\x2f",  # ASCII "+/v/"
        b"\x2b\x2f\x76\x38\x2d",  # ASCII "+/v8-"
    ],
    "gb18030": b"\x84\x31\x95\x33",
    "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
    "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
}
22
+
23
# Size thresholds used to classify a sequence as too small or too big.
# NOTE(review): presumably expressed in bytes; confirm against the callers.
TOO_SMALL_SEQUENCE: int = 32
TOO_BIG_SEQUENCE: int = 10_000_000

# 1,112,064 is the number of Unicode scalar values (0x110000 code points minus
# the 2,048 surrogates).
UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
27
+
28
+ # Unicode block names mapped to their code point ranges, up to date with the Unicode Character Database (UCD) 15.0.0.
29
+ UNICODE_RANGES_COMBINED: dict[str, range] = {
30
+ "Control character": range(32),
31
+ "Basic Latin": range(32, 128),
32
+ "Latin-1 Supplement": range(128, 256),
33
+ "Latin Extended-A": range(256, 384),
34
+ "Latin Extended-B": range(384, 592),
35
+ "IPA Extensions": range(592, 688),
36
+ "Spacing Modifier Letters": range(688, 768),
37
+ "Combining Diacritical Marks": range(768, 880),
38
+ "Greek and Coptic": range(880, 1024),
39
+ "Cyrillic": range(1024, 1280),
40
+ "Cyrillic Supplement": range(1280, 1328),
41
+ "Armenian": range(1328, 1424),
42
+ "Hebrew": range(1424, 1536),
43
+ "Arabic": range(1536, 1792),
44
+ "Syriac": range(1792, 1872),
45
+ "Arabic Supplement": range(1872, 1920),
46
+ "Thaana": range(1920, 1984),
47
+ "NKo": range(1984, 2048),
48
+ "Samaritan": range(2048, 2112),
49
+ "Mandaic": range(2112, 2144),
50
+ "Syriac Supplement": range(2144, 2160),
51
+ "Arabic Extended-B": range(2160, 2208),
52
+ "Arabic Extended-A": range(2208, 2304),
53
+ "Devanagari": range(2304, 2432),
54
+ "Bengali": range(2432, 2560),
55
+ "Gurmukhi": range(2560, 2688),
56
+ "Gujarati": range(2688, 2816),
57
+ "Oriya": range(2816, 2944),
58
+ "Tamil": range(2944, 3072),
59
+ "Telugu": range(3072, 3200),
60
+ "Kannada": range(3200, 3328),
61
+ "Malayalam": range(3328, 3456),
62
+ "Sinhala": range(3456, 3584),
63
+ "Thai": range(3584, 3712),
64
+ "Lao": range(3712, 3840),
65
+ "Tibetan": range(3840, 4096),
66
+ "Myanmar": range(4096, 4256),
67
+ "Georgian": range(4256, 4352),
68
+ "Hangul Jamo": range(4352, 4608),
69
+ "Ethiopic": range(4608, 4992),
70
+ "Ethiopic Supplement": range(4992, 5024),
71
+ "Cherokee": range(5024, 5120),
72
+ "Unified Canadian Aboriginal Syllabics": range(5120, 5760),
73
+ "Ogham": range(5760, 5792),
74
+ "Runic": range(5792, 5888),
75
+ "Tagalog": range(5888, 5920),
76
+ "Hanunoo": range(5920, 5952),
77
+ "Buhid": range(5952, 5984),
78
+ "Tagbanwa": range(5984, 6016),
79
+ "Khmer": range(6016, 6144),
80
+ "Mongolian": range(6144, 6320),
81
+ "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
82
+ "Limbu": range(6400, 6480),
83
+ "Tai Le": range(6480, 6528),
84
+ "New Tai Lue": range(6528, 6624),
85
+ "Khmer Symbols": range(6624, 6656),
86
+ "Buginese": range(6656, 6688),
87
+ "Tai Tham": range(6688, 6832),
88
+ "Combining Diacritical Marks Extended": range(6832, 6912),
89
+ "Balinese": range(6912, 7040),
90
+ "Sundanese": range(7040, 7104),
91
+ "Batak": range(7104, 7168),
92
+ "Lepcha": range(7168, 7248),
93
+ "Ol Chiki": range(7248, 7296),
94
+ "Cyrillic Extended-C": range(7296, 7312),
95
+ "Georgian Extended": range(7312, 7360),
96
+ "Sundanese Supplement": range(7360, 7376),
97
+ "Vedic Extensions": range(7376, 7424),
98
+ "Phonetic Extensions": range(7424, 7552),
99
+ "Phonetic Extensions Supplement": range(7552, 7616),
100
+ "Combining Diacritical Marks Supplement": range(7616, 7680),
101
+ "Latin Extended Additional": range(7680, 7936),
102
+ "Greek Extended": range(7936, 8192),
103
+ "General Punctuation": range(8192, 8304),
104
+ "Superscripts and Subscripts": range(8304, 8352),
105
+ "Currency Symbols": range(8352, 8400),
106
+ "Combining Diacritical Marks for Symbols": range(8400, 8448),
107
+ "Letterlike Symbols": range(8448, 8528),
108
+ "Number Forms": range(8528, 8592),
109
+ "Arrows": range(8592, 8704),
110
+ "Mathematical Operators": range(8704, 8960),
111
+ "Miscellaneous Technical": range(8960, 9216),
112
+ "Control Pictures": range(9216, 9280),
113
+ "Optical Character Recognition": range(9280, 9312),
114
+ "Enclosed Alphanumerics": range(9312, 9472),
115
+ "Box Drawing": range(9472, 9600),
116
+ "Block Elements": range(9600, 9632),
117
+ "Geometric Shapes": range(9632, 9728),
118
+ "Miscellaneous Symbols": range(9728, 9984),
119
+ "Dingbats": range(9984, 10176),
120
+ "Miscellaneous Mathematical Symbols-A": range(10176, 10224),
121
+ "Supplemental Arrows-A": range(10224, 10240),
122
+ "Braille Patterns": range(10240, 10496),
123
+ "Supplemental Arrows-B": range(10496, 10624),
124
+ "Miscellaneous Mathematical Symbols-B": range(10624, 10752),
125
+ "Supplemental Mathematical Operators": range(10752, 11008),
126
+ "Miscellaneous Symbols and Arrows": range(11008, 11264),
127
+ "Glagolitic": range(11264, 11360),
128
+ "Latin Extended-C": range(11360, 11392),
129
+ "Coptic": range(11392, 11520),
130
+ "Georgian Supplement": range(11520, 11568),
131
+ "Tifinagh": range(11568, 11648),
132
+ "Ethiopic Extended": range(11648, 11744),
133
+ "Cyrillic Extended-A": range(11744, 11776),
134
+ "Supplemental Punctuation": range(11776, 11904),
135
+ "CJK Radicals Supplement": range(11904, 12032),
136
+ "Kangxi Radicals": range(12032, 12256),
137
+ "Ideographic Description Characters": range(12272, 12288),
138
+ "CJK Symbols and Punctuation": range(12288, 12352),
139
+ "Hiragana": range(12352, 12448),
140
+ "Katakana": range(12448, 12544),
141
+ "Bopomofo": range(12544, 12592),
142
+ "Hangul Compatibility Jamo": range(12592, 12688),
143
+ "Kanbun": range(12688, 12704),
144
+ "Bopomofo Extended": range(12704, 12736),
145
+ "CJK Strokes": range(12736, 12784),
146
+ "Katakana Phonetic Extensions": range(12784, 12800),
147
+ "Enclosed CJK Letters and Months": range(12800, 13056),
148
+ "CJK Compatibility": range(13056, 13312),
149
+ "CJK Unified Ideographs Extension A": range(13312, 19904),
150
+ "Yijing Hexagram Symbols": range(19904, 19968),
151
+ "CJK Unified Ideographs": range(19968, 40960),
152
+ "Yi Syllables": range(40960, 42128),
153
+ "Yi Radicals": range(42128, 42192),
154
+ "Lisu": range(42192, 42240),
155
+ "Vai": range(42240, 42560),
156
+ "Cyrillic Extended-B": range(42560, 42656),
157
+ "Bamum": range(42656, 42752),
158
+ "Modifier Tone Letters": range(42752, 42784),
159
+ "Latin Extended-D": range(42784, 43008),
160
+ "Syloti Nagri": range(43008, 43056),
161
+ "Common Indic Number Forms": range(43056, 43072),
162
+ "Phags-pa": range(43072, 43136),
163
+ "Saurashtra": range(43136, 43232),
164
+ "Devanagari Extended": range(43232, 43264),
165
+ "Kayah Li": range(43264, 43312),
166
+ "Rejang": range(43312, 43360),
167
+ "Hangul Jamo Extended-A": range(43360, 43392),
168
+ "Javanese": range(43392, 43488),
169
+ "Myanmar Extended-B": range(43488, 43520),
170
+ "Cham": range(43520, 43616),
171
+ "Myanmar Extended-A": range(43616, 43648),
172
+ "Tai Viet": range(43648, 43744),
173
+ "Meetei Mayek Extensions": range(43744, 43776),
174
+ "Ethiopic Extended-A": range(43776, 43824),
175
+ "Latin Extended-E": range(43824, 43888),
176
+ "Cherokee Supplement": range(43888, 43968),
177
+ "Meetei Mayek": range(43968, 44032),
178
+ "Hangul Syllables": range(44032, 55216),
179
+ "Hangul Jamo Extended-B": range(55216, 55296),
180
+ "High Surrogates": range(55296, 56192),
181
+ "High Private Use Surrogates": range(56192, 56320),
182
+ "Low Surrogates": range(56320, 57344),
183
+ "Private Use Area": range(57344, 63744),
184
+ "CJK Compatibility Ideographs": range(63744, 64256),
185
+ "Alphabetic Presentation Forms": range(64256, 64336),
186
+ "Arabic Presentation Forms-A": range(64336, 65024),
187
+ "Variation Selectors": range(65024, 65040),
188
+ "Vertical Forms": range(65040, 65056),
189
+ "Combining Half Marks": range(65056, 65072),
190
+ "CJK Compatibility Forms": range(65072, 65104),
191
+ "Small Form Variants": range(65104, 65136),
192
+ "Arabic Presentation Forms-B": range(65136, 65280),
193
+ "Halfwidth and Fullwidth Forms": range(65280, 65520),
194
+ "Specials": range(65520, 65536),
195
+ "Linear B Syllabary": range(65536, 65664),
196
+ "Linear B Ideograms": range(65664, 65792),
197
+ "Aegean Numbers": range(65792, 65856),
198
+ "Ancient Greek Numbers": range(65856, 65936),
199
+ "Ancient Symbols": range(65936, 66000),
200
+ "Phaistos Disc": range(66000, 66048),
201
+ "Lycian": range(66176, 66208),
202
+ "Carian": range(66208, 66272),
203
+ "Coptic Epact Numbers": range(66272, 66304),
204
+ "Old Italic": range(66304, 66352),
205
+ "Gothic": range(66352, 66384),
206
+ "Old Permic": range(66384, 66432),
207
+ "Ugaritic": range(66432, 66464),
208
+ "Old Persian": range(66464, 66528),
209
+ "Deseret": range(66560, 66640),
210
+ "Shavian": range(66640, 66688),
211
+ "Osmanya": range(66688, 66736),
212
+ "Osage": range(66736, 66816),
213
+ "Elbasan": range(66816, 66864),
214
+ "Caucasian Albanian": range(66864, 66928),
215
+ "Vithkuqi": range(66928, 67008),
216
+ "Linear A": range(67072, 67456),
217
+ "Latin Extended-F": range(67456, 67520),
218
+ "Cypriot Syllabary": range(67584, 67648),
219
+ "Imperial Aramaic": range(67648, 67680),
220
+ "Palmyrene": range(67680, 67712),
221
+ "Nabataean": range(67712, 67760),
222
+ "Hatran": range(67808, 67840),
223
+ "Phoenician": range(67840, 67872),
224
+ "Lydian": range(67872, 67904),
225
+ "Meroitic Hieroglyphs": range(67968, 68000),
226
+ "Meroitic Cursive": range(68000, 68096),
227
+ "Kharoshthi": range(68096, 68192),
228
+ "Old South Arabian": range(68192, 68224),
229
+ "Old North Arabian": range(68224, 68256),
230
+ "Manichaean": range(68288, 68352),
231
+ "Avestan": range(68352, 68416),
232
+ "Inscriptional Parthian": range(68416, 68448),
233
+ "Inscriptional Pahlavi": range(68448, 68480),
234
+ "Psalter Pahlavi": range(68480, 68528),
235
+ "Old Turkic": range(68608, 68688),
236
+ "Old Hungarian": range(68736, 68864),
237
+ "Hanifi Rohingya": range(68864, 68928),
238
+ "Rumi Numeral Symbols": range(69216, 69248),
239
+ "Yezidi": range(69248, 69312),
240
+ "Arabic Extended-C": range(69312, 69376),
241
+ "Old Sogdian": range(69376, 69424),
242
+ "Sogdian": range(69424, 69488),
243
+ "Old Uyghur": range(69488, 69552),
244
+ "Chorasmian": range(69552, 69600),
245
+ "Elymaic": range(69600, 69632),
246
+ "Brahmi": range(69632, 69760),
247
+ "Kaithi": range(69760, 69840),
248
+ "Sora Sompeng": range(69840, 69888),
249
+ "Chakma": range(69888, 69968),
250
+ "Mahajani": range(69968, 70016),
251
+ "Sharada": range(70016, 70112),
252
+ "Sinhala Archaic Numbers": range(70112, 70144),
253
+ "Khojki": range(70144, 70224),
254
+ "Multani": range(70272, 70320),
255
+ "Khudawadi": range(70320, 70400),
256
+ "Grantha": range(70400, 70528),
257
+ "Newa": range(70656, 70784),
258
+ "Tirhuta": range(70784, 70880),
259
+ "Siddham": range(71040, 71168),
260
+ "Modi": range(71168, 71264),
261
+ "Mongolian Supplement": range(71264, 71296),
262
+ "Takri": range(71296, 71376),
263
+ "Ahom": range(71424, 71504),
264
+ "Dogra": range(71680, 71760),
265
+ "Warang Citi": range(71840, 71936),
266
+ "Dives Akuru": range(71936, 72032),
267
+ "Nandinagari": range(72096, 72192),
268
+ "Zanabazar Square": range(72192, 72272),
269
+ "Soyombo": range(72272, 72368),
270
+ "Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
271
+ "Pau Cin Hau": range(72384, 72448),
272
+ "Devanagari Extended-A": range(72448, 72544),
273
+ "Bhaiksuki": range(72704, 72816),
274
+ "Marchen": range(72816, 72896),
275
+ "Masaram Gondi": range(72960, 73056),
276
+ "Gunjala Gondi": range(73056, 73136),
277
+ "Makasar": range(73440, 73472),
278
+ "Kawi": range(73472, 73568),
279
+ "Lisu Supplement": range(73648, 73664),
280
+ "Tamil Supplement": range(73664, 73728),
281
+ "Cuneiform": range(73728, 74752),
282
+ "Cuneiform Numbers and Punctuation": range(74752, 74880),
283
+ "Early Dynastic Cuneiform": range(74880, 75088),
284
+ "Cypro-Minoan": range(77712, 77824),
285
+ "Egyptian Hieroglyphs": range(77824, 78896),
286
+ "Egyptian Hieroglyph Format Controls": range(78896, 78944),
287
+ "Anatolian Hieroglyphs": range(82944, 83584),
288
+ "Bamum Supplement": range(92160, 92736),
289
+ "Mro": range(92736, 92784),
290
+ "Tangsa": range(92784, 92880),
291
+ "Bassa Vah": range(92880, 92928),
292
+ "Pahawh Hmong": range(92928, 93072),
293
+ "Medefaidrin": range(93760, 93856),
294
+ "Miao": range(93952, 94112),
295
+ "Ideographic Symbols and Punctuation": range(94176, 94208),
296
+ "Tangut": range(94208, 100352),
297
+ "Tangut Components": range(100352, 101120),
298
+ "Khitan Small Script": range(101120, 101632),
299
+ "Tangut Supplement": range(101632, 101760),
300
+ "Kana Extended-B": range(110576, 110592),
301
+ "Kana Supplement": range(110592, 110848),
302
+ "Kana Extended-A": range(110848, 110896),
303
+ "Small Kana Extension": range(110896, 110960),
304
+ "Nushu": range(110960, 111360),
305
+ "Duployan": range(113664, 113824),
306
+ "Shorthand Format Controls": range(113824, 113840),
307
+ "Znamenny Musical Notation": range(118528, 118736),
308
+ "Byzantine Musical Symbols": range(118784, 119040),
309
+ "Musical Symbols": range(119040, 119296),
310
+ "Ancient Greek Musical Notation": range(119296, 119376),
311
+ "Kaktovik Numerals": range(119488, 119520),
312
+ "Mayan Numerals": range(119520, 119552),
313
+ "Tai Xuan Jing Symbols": range(119552, 119648),
314
+ "Counting Rod Numerals": range(119648, 119680),
315
+ "Mathematical Alphanumeric Symbols": range(119808, 120832),
316
+ "Sutton SignWriting": range(120832, 121520),
317
+ "Latin Extended-G": range(122624, 122880),
318
+ "Glagolitic Supplement": range(122880, 122928),
319
+ "Cyrillic Extended-D": range(122928, 123024),
320
+ "Nyiakeng Puachue Hmong": range(123136, 123216),
321
+ "Toto": range(123536, 123584),
322
+ "Wancho": range(123584, 123648),
323
+ "Nag Mundari": range(124112, 124160),
324
+ "Ethiopic Extended-B": range(124896, 124928),
325
+ "Mende Kikakui": range(124928, 125152),
326
+ "Adlam": range(125184, 125280),
327
+ "Indic Siyaq Numbers": range(126064, 126144),
328
+ "Ottoman Siyaq Numbers": range(126208, 126288),
329
+ "Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
330
+ "Mahjong Tiles": range(126976, 127024),
331
+ "Domino Tiles": range(127024, 127136),
332
+ "Playing Cards": range(127136, 127232),
333
+ "Enclosed Alphanumeric Supplement": range(127232, 127488),
334
+ "Enclosed Ideographic Supplement": range(127488, 127744),
335
+ "Miscellaneous Symbols and Pictographs": range(127744, 128512),
336
+ "Emoticons range(Emoji)": range(128512, 128592),
337
+ "Ornamental Dingbats": range(128592, 128640),
338
+ "Transport and Map Symbols": range(128640, 128768),
339
+ "Alchemical Symbols": range(128768, 128896),
340
+ "Geometric Shapes Extended": range(128896, 129024),
341
+ "Supplemental Arrows-C": range(129024, 129280),
342
+ "Supplemental Symbols and Pictographs": range(129280, 129536),
343
+ "Chess Symbols": range(129536, 129648),
344
+ "Symbols and Pictographs Extended-A": range(129648, 129792),
345
+ "Symbols for Legacy Computing": range(129792, 130048),
346
+ "CJK Unified Ideographs Extension B": range(131072, 173792),
347
+ "CJK Unified Ideographs Extension C": range(173824, 177984),
348
+ "CJK Unified Ideographs Extension D": range(177984, 178208),
349
+ "CJK Unified Ideographs Extension E": range(178208, 183984),
350
+ "CJK Unified Ideographs Extension F": range(183984, 191472),
351
+ "CJK Compatibility Ideographs Supplement": range(194560, 195104),
352
+ "CJK Unified Ideographs Extension G": range(196608, 201552),
353
+ "CJK Unified Ideographs Extension H": range(201552, 205744),
354
+ "Tags": range(917504, 917632),
355
+ "Variation Selectors Supplement": range(917760, 918000),
356
+ "Supplementary Private Use Area-A": range(983040, 1048576),
357
+ "Supplementary Private Use Area-B": range(1048576, 1114112),
358
+ }
359
+
360
+
361
# Keywords that mark a Unicode range name as a "secondary" range.
# NOTE(review): how consumers match these keywords against range names is not
# visible here; confirm before relying on substring semantics.
UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = (
    "Supplement Extended Extensions Modifier Marks "
    "Punctuation Symbols Forms Operators Miscellaneous "
    "Drawing Block Shapes Supplemental Tags"
).split()
378
+
379
# Case-insensitive pattern spotting an encoding declaration: one of the words
# "encoding", "charset" or "coding", followed by 1 to 10 separator characters
# (":", "=" or space), an optional quote, then the declared name, captured as
# group 1 (letters, digits, "-" and "_").
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
    r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
    IGNORECASE,
)
383
+
384
# Encoding names added on top of the targets found in ``encodings.aliases``.
# NOTE(review): presumably codecs that lack an alias entry, yet some of them
# (e.g. "koi8_r") may still be alias targets depending on the Python version.
IANA_NO_ALIASES = [
    "cp720",
    "cp737",
    "cp856",
    "cp874",
    "cp875",
    "cp1006",
    "koi8_r",
    "koi8_t",
    "koi8_u",
]

# Sorted list of every supported encoding name: all alias targets plus the
# extra names above, without the "*_codec" entries and a few excluded codecs.
# The two sources are merged through a set union so that a name present in
# both is listed only once.
IANA_SUPPORTED: list[str] = sorted(
    name
    for name in set(aliases.values()).union(IANA_NO_ALIASES)
    if not name.endswith("_codec") and name not in {"rot_13", "tactis", "mbcs"}
)

IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
405
+
406
# Pre-computed table of similar code pages, generated with the function
# cp_similarity. Each key maps to the list of code pages considered similar
# to it.
IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
    "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
    "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
    "cp1125": ["cp866"],
    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
    "cp1250": ["iso8859_2"],
    "cp1251": ["kz1048", "ptcp154"],
    "cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1253": ["iso8859_7"],
    "cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1257": ["iso8859_13"],
    "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
    "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
    "cp500": ["cp037", "cp1026", "cp1140", "cp273"],
    "cp850": ["cp437", "cp857", "cp858", "cp865"],
    "cp857": ["cp850", "cp858", "cp865"],
    "cp858": ["cp437", "cp850", "cp857", "cp865"],
    "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
    "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
    "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
    "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
    "cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
    "cp866": ["cp1125"],
    "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
    "iso8859_11": ["tis_620"],
    "iso8859_13": ["cp1257"],
    "iso8859_14": [
        "iso8859_10",
        "iso8859_15",
        "iso8859_16",
        "iso8859_3",
        "iso8859_9",
        "latin_1",
    ],
    "iso8859_15": [
        "cp1252",
        "cp1254",
        "iso8859_10",
        "iso8859_14",
        "iso8859_16",
        "iso8859_3",
        "iso8859_9",
        "latin_1",
    ],
    "iso8859_16": [
        "iso8859_14",
        "iso8859_15",
        "iso8859_2",
        "iso8859_3",
        "iso8859_9",
        "latin_1",
    ],
    "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
    "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
    "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
    "iso8859_7": ["cp1253"],
    "iso8859_9": [
        "cp1252",
        "cp1254",
        "cp1258",
        "iso8859_10",
        "iso8859_14",
        "iso8859_15",
        "iso8859_16",
        "iso8859_3",
        "iso8859_4",
        "latin_1",
    ],
    "kz1048": ["cp1251", "ptcp154"],
    "latin_1": [
        "cp1252",
        "cp1254",
        "cp1258",
        "iso8859_10",
        "iso8859_14",
        "iso8859_15",
        "iso8859_16",
        "iso8859_3",
        "iso8859_4",
        "iso8859_9",
    ],
    "mac_iceland": ["mac_roman", "mac_turkish"],
    "mac_roman": ["mac_iceland", "mac_turkish"],
    "mac_turkish": ["mac_iceland", "mac_roman"],
    "ptcp154": ["cp1251", "kz1048"],
    "tis_620": ["iso8859_11"],
}
494
+
495
+
496
# Maps Python codec names (keys) to the corresponding encoding labels (values).
# NOTE(review): judging by the name, the labels follow the spelling used by
# chardet, including its mixed casing ("utf-8", "Windows-1252"); confirm
# against the consumer of this table before normalizing any value.
CHARDET_CORRESPONDENCE: dict[str, str] = {
    "iso2022_kr": "ISO-2022-KR",
    "iso2022_jp": "ISO-2022-JP",
    "euc_kr": "EUC-KR",
    "tis_620": "TIS-620",
    "utf_32": "UTF-32",
    "euc_jp": "EUC-JP",
    "koi8_r": "KOI8-R",
    "iso8859_1": "ISO-8859-1",
    "iso8859_2": "ISO-8859-2",
    "iso8859_5": "ISO-8859-5",
    "iso8859_6": "ISO-8859-6",
    "iso8859_7": "ISO-8859-7",
    "iso8859_8": "ISO-8859-8",
    "utf_16": "UTF-16",
    "cp855": "IBM855",
    "mac_cyrillic": "MacCyrillic",
    "gb2312": "GB2312",
    "gb18030": "GB18030",
    "cp932": "CP932",
    "cp866": "IBM866",
    "utf_8": "utf-8",
    "utf_8_sig": "UTF-8-SIG",
    "shift_jis": "SHIFT_JIS",
    "big5": "Big5",
    "cp1250": "windows-1250",
    "cp1251": "windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "windows-1253",
    "cp1255": "windows-1255",
    "cp1256": "windows-1256",
    "cp1254": "Windows-1254",
    "cp949": "CP949",
}
530
+
531
+
532
# ASCII punctuation and markup characters regarded as common and safe.
# Built from a single string: every character becomes one member of the set.
COMMON_SAFE_ASCII_CHARACTERS: set[str] = set('<>=:/&;{}[],|"-()')
551
+
552
+
553
# Codec names of Korean (KO) and Chinese (ZH) encodings.
# NOTE(review): how these groups are used is not visible here; confirm with callers.
KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"}
ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"}

# Custom logging level sitting below DEBUG (logging.DEBUG is 10).
TRACE: int = 5
558
+
559
+
560
+ # Language labels that end with the em dash "—" character
561
+ # are to be considered alternative sequences of the original language.
562
+ FREQUENCIES: dict[str, list[str]] = {
563
+ "English": [
564
+ "e",
565
+ "a",
566
+ "t",
567
+ "i",
568
+ "o",
569
+ "n",
570
+ "s",
571
+ "r",
572
+ "h",
573
+ "l",
574
+ "d",
575
+ "c",
576
+ "u",
577
+ "m",
578
+ "f",
579
+ "p",
580
+ "g",
581
+ "w",
582
+ "y",
583
+ "b",
584
+ "v",
585
+ "k",
586
+ "x",
587
+ "j",
588
+ "z",
589
+ "q",
590
+ ],
591
+ "English—": [
592
+ "e",
593
+ "a",
594
+ "t",
595
+ "i",
596
+ "o",
597
+ "n",
598
+ "s",
599
+ "r",
600
+ "h",
601
+ "l",
602
+ "d",
603
+ "c",
604
+ "m",
605
+ "u",
606
+ "f",
607
+ "p",
608
+ "g",
609
+ "w",
610
+ "b",
611
+ "y",
612
+ "v",
613
+ "k",
614
+ "j",
615
+ "x",
616
+ "z",
617
+ "q",
618
+ ],
619
+ "German": [
620
+ "e",
621
+ "n",
622
+ "i",
623
+ "r",
624
+ "s",
625
+ "t",
626
+ "a",
627
+ "d",
628
+ "h",
629
+ "u",
630
+ "l",
631
+ "g",
632
+ "o",
633
+ "c",
634
+ "m",
635
+ "b",
636
+ "f",
637
+ "k",
638
+ "w",
639
+ "z",
640
+ "p",
641
+ "v",
642
+ "ü",
643
+ "ä",
644
+ "ö",
645
+ "j",
646
+ ],
647
+ "French": [
648
+ "e",
649
+ "a",
650
+ "s",
651
+ "n",
652
+ "i",
653
+ "t",
654
+ "r",
655
+ "l",
656
+ "u",
657
+ "o",
658
+ "d",
659
+ "c",
660
+ "p",
661
+ "m",
662
+ "é",
663
+ "v",
664
+ "g",
665
+ "f",
666
+ "b",
667
+ "h",
668
+ "q",
669
+ "à",
670
+ "x",
671
+ "è",
672
+ "y",
673
+ "j",
674
+ ],
675
+ "Dutch": [
676
+ "e",
677
+ "n",
678
+ "a",
679
+ "i",
680
+ "r",
681
+ "t",
682
+ "o",
683
+ "d",
684
+ "s",
685
+ "l",
686
+ "g",
687
+ "h",
688
+ "v",
689
+ "m",
690
+ "u",
691
+ "k",
692
+ "c",
693
+ "p",
694
+ "b",
695
+ "w",
696
+ "j",
697
+ "z",
698
+ "f",
699
+ "y",
700
+ "x",
701
+ "ë",
702
+ ],
703
+ "Italian": [
704
+ "e",
705
+ "i",
706
+ "a",
707
+ "o",
708
+ "n",
709
+ "l",
710
+ "t",
711
+ "r",
712
+ "s",
713
+ "c",
714
+ "d",
715
+ "u",
716
+ "p",
717
+ "m",
718
+ "g",
719
+ "v",
720
+ "f",
721
+ "b",
722
+ "z",
723
+ "h",
724
+ "q",
725
+ "è",
726
+ "à",
727
+ "k",
728
+ "y",
729
+ "ò",
730
+ ],
731
+ "Polish": [
732
+ "a",
733
+ "i",
734
+ "o",
735
+ "e",
736
+ "n",
737
+ "r",
738
+ "z",
739
+ "w",
740
+ "s",
741
+ "c",
742
+ "t",
743
+ "k",
744
+ "y",
745
+ "d",
746
+ "p",
747
+ "m",
748
+ "u",
749
+ "l",
750
+ "j",
751
+ "ł",
752
+ "g",
753
+ "b",
754
+ "h",
755
+ "ą",
756
+ "ę",
757
+ "ó",
758
+ ],
759
+ "Spanish": [
760
+ "e",
761
+ "a",
762
+ "o",
763
+ "n",
764
+ "s",
765
+ "r",
766
+ "i",
767
+ "l",
768
+ "d",
769
+ "t",
770
+ "c",
771
+ "u",
772
+ "m",
773
+ "p",
774
+ "b",
775
+ "g",
776
+ "v",
777
+ "f",
778
+ "y",
779
+ "ó",
780
+ "h",
781
+ "q",
782
+ "í",
783
+ "j",
784
+ "z",
785
+ "á",
786
+ ],
787
+ "Russian": [
788
+ "о",
789
+ "а",
790
+ "е",
791
+ "и",
792
+ "н",
793
+ "с",
794
+ "т",
795
+ "р",
796
+ "в",
797
+ "л",
798
+ "к",
799
+ "м",
800
+ "д",
801
+ "п",
802
+ "у",
803
+ "г",
804
+ "я",
805
+ "ы",
806
+ "з",
807
+ "б",
808
+ "й",
809
+ "ь",
810
+ "ч",
811
+ "х",
812
+ "ж",
813
+ "ц",
814
+ ],
815
+ # Jap-Kanji
816
+ "Japanese": [
817
+ "人",
818
+ "一",
819
+ "大",
820
+ "亅",
821
+ "丁",
822
+ "丨",
823
+ "竹",
824
+ "笑",
825
+ "口",
826
+ "日",
827
+ "今",
828
+ "二",
829
+ "彳",
830
+ "行",
831
+ "十",
832
+ "土",
833
+ "丶",
834
+ "寸",
835
+ "寺",
836
+ "時",
837
+ "乙",
838
+ "丿",
839
+ "乂",
840
+ "气",
841
+ "気",
842
+ "冂",
843
+ "巾",
844
+ "亠",
845
+ "市",
846
+ "目",
847
+ "儿",
848
+ "見",
849
+ "八",
850
+ "小",
851
+ "凵",
852
+ "県",
853
+ "月",
854
+ "彐",
855
+ "門",
856
+ "間",
857
+ "木",
858
+ "東",
859
+ "山",
860
+ "出",
861
+ "本",
862
+ "中",
863
+ "刀",
864
+ "分",
865
+ "耳",
866
+ "又",
867
+ "取",
868
+ "最",
869
+ "言",
870
+ "田",
871
+ "心",
872
+ "思",
873
+ "刂",
874
+ "前",
875
+ "京",
876
+ "尹",
877
+ "事",
878
+ "生",
879
+ "厶",
880
+ "云",
881
+ "会",
882
+ "未",
883
+ "来",
884
+ "白",
885
+ "冫",
886
+ "楽",
887
+ "灬",
888
+ "馬",
889
+ "尸",
890
+ "尺",
891
+ "駅",
892
+ "明",
893
+ "耂",
894
+ "者",
895
+ "了",
896
+ "阝",
897
+ "都",
898
+ "高",
899
+ "卜",
900
+ "占",
901
+ "厂",
902
+ "广",
903
+ "店",
904
+ "子",
905
+ "申",
906
+ "奄",
907
+ "亻",
908
+ "俺",
909
+ "上",
910
+ "方",
911
+ "冖",
912
+ "学",
913
+ "衣",
914
+ "艮",
915
+ "食",
916
+ "自",
917
+ ],
918
+ # Jap-Katakana
919
+ "Japanese—": [
920
+ "ー",
921
+ "ン",
922
+ "ス",
923
+ "・",
924
+ "ル",
925
+ "ト",
926
+ "リ",
927
+ "イ",
928
+ "ア",
929
+ "ラ",
930
+ "ッ",
931
+ "ク",
932
+ "ド",
933
+ "シ",
934
+ "レ",
935
+ "ジ",
936
+ "タ",
937
+ "フ",
938
+ "ロ",
939
+ "カ",
940
+ "テ",
941
+ "マ",
942
+ "ィ",
943
+ "グ",
944
+ "バ",
945
+ "ム",
946
+ "プ",
947
+ "オ",
948
+ "コ",
949
+ "デ",
950
+ "ニ",
951
+ "ウ",
952
+ "メ",
953
+ "サ",
954
+ "ビ",
955
+ "ナ",
956
+ "ブ",
957
+ "ャ",
958
+ "エ",
959
+ "ュ",
960
+ "チ",
961
+ "キ",
962
+ "ズ",
963
+ "ダ",
964
+ "パ",
965
+ "ミ",
966
+ "ェ",
967
+ "ョ",
968
+ "ハ",
969
+ "セ",
970
+ "ベ",
971
+ "ガ",
972
+ "モ",
973
+ "ツ",
974
+ "ネ",
975
+ "ボ",
976
+ "ソ",
977
+ "ノ",
978
+ "ァ",
979
+ "ヴ",
980
+ "ワ",
981
+ "ポ",
982
+ "ペ",
983
+ "ピ",
984
+ "ケ",
985
+ "ゴ",
986
+ "ギ",
987
+ "ザ",
988
+ "ホ",
989
+ "ゲ",
990
+ "ォ",
991
+ "ヤ",
992
+ "ヒ",
993
+ "ユ",
994
+ "ヨ",
995
+ "ヘ",
996
+ "ゼ",
997
+ "ヌ",
998
+ "ゥ",
999
+ "ゾ",
1000
+ "ヶ",
1001
+ "ヂ",
1002
+ "ヲ",
1003
+ "ヅ",
1004
+ "ヵ",
1005
+ "ヱ",
1006
+ "ヰ",
1007
+ "ヮ",
1008
+ "ヽ",
1009
+ "゠",
1010
+ "ヾ",
1011
+ "ヷ",
1012
+ "ヿ",
1013
+ "ヸ",
1014
+ "ヹ",
1015
+ "ヺ",
1016
+ ],
1017
+ # Jap-Hiragana
1018
+ "Japanese——": [
1019
+ "の",
1020
+ "に",
1021
+ "る",
1022
+ "た",
1023
+ "と",
1024
+ "は",
1025
+ "し",
1026
+ "い",
1027
+ "を",
1028
+ "で",
1029
+ "て",
1030
+ "が",
1031
+ "な",
1032
+ "れ",
1033
+ "か",
1034
+ "ら",
1035
+ "さ",
1036
+ "っ",
1037
+ "り",
1038
+ "す",
1039
+ "あ",
1040
+ "も",
1041
+ "こ",
1042
+ "ま",
1043
+ "う",
1044
+ "く",
1045
+ "よ",
1046
+ "き",
1047
+ "ん",
1048
+ "め",
1049
+ "お",
1050
+ "け",
1051
+ "そ",
1052
+ "つ",
1053
+ "だ",
1054
+ "や",
1055
+ "え",
1056
+ "ど",
1057
+ "わ",
1058
+ "ち",
1059
+ "み",
1060
+ "せ",
1061
+ "じ",
1062
+ "ば",
1063
+ "へ",
1064
+ "び",
1065
+ "ず",
1066
+ "ろ",
1067
+ "ほ",
1068
+ "げ",
1069
+ "む",
1070
+ "べ",
1071
+ "ひ",
1072
+ "ょ",
1073
+ "ゆ",
1074
+ "ぶ",
1075
+ "ご",
1076
+ "ゃ",
1077
+ "ね",
1078
+ "ふ",
1079
+ "ぐ",
1080
+ "ぎ",
1081
+ "ぼ",
1082
+ "ゅ",
1083
+ "づ",
1084
+ "ざ",
1085
+ "ぞ",
1086
+ "ぬ",
1087
+ "ぜ",
1088
+ "ぱ",
1089
+ "ぽ",
1090
+ "ぷ",
1091
+ "ぴ",
1092
+ "ぃ",
1093
+ "ぁ",
1094
+ "ぇ",
1095
+ "ぺ",
1096
+ "ゞ",
1097
+ "ぢ",
1098
+ "ぉ",
1099
+ "ぅ",
1100
+ "ゐ",
1101
+ "ゝ",
1102
+ "ゑ",
1103
+ "゛",
1104
+ "゜",
1105
+ "ゎ",
1106
+ "ゔ",
1107
+ "゚",
1108
+ "ゟ",
1109
+ "゙",
1110
+ "ゕ",
1111
+ "ゖ",
1112
+ ],
1113
+ "Portuguese": [
1114
+ "a",
1115
+ "e",
1116
+ "o",
1117
+ "s",
1118
+ "i",
1119
+ "r",
1120
+ "d",
1121
+ "n",
1122
+ "t",
1123
+ "m",
1124
+ "u",
1125
+ "c",
1126
+ "l",
1127
+ "p",
1128
+ "g",
1129
+ "v",
1130
+ "b",
1131
+ "f",
1132
+ "h",
1133
+ "ã",
1134
+ "q",
1135
+ "é",
1136
+ "ç",
1137
+ "á",
1138
+ "z",
1139
+ "í",
1140
+ ],
1141
+ "Swedish": [
1142
+ "e",
1143
+ "a",
1144
+ "n",
1145
+ "r",
1146
+ "t",
1147
+ "s",
1148
+ "i",
1149
+ "l",
1150
+ "d",
1151
+ "o",
1152
+ "m",
1153
+ "k",
1154
+ "g",
1155
+ "v",
1156
+ "h",
1157
+ "f",
1158
+ "u",
1159
+ "p",
1160
+ "ä",
1161
+ "c",
1162
+ "b",
1163
+ "ö",
1164
+ "å",
1165
+ "y",
1166
+ "j",
1167
+ "x",
1168
+ ],
1169
+ "Chinese": [
1170
+ "的",
1171
+ "一",
1172
+ "是",
1173
+ "不",
1174
+ "了",
1175
+ "在",
1176
+ "人",
1177
+ "有",
1178
+ "我",
1179
+ "他",
1180
+ "这",
1181
+ "个",
1182
+ "们",
1183
+ "中",
1184
+ "来",
1185
+ "上",
1186
+ "大",
1187
+ "为",
1188
+ "和",
1189
+ "国",
1190
+ "地",
1191
+ "到",
1192
+ "以",
1193
+ "说",
1194
+ "时",
1195
+ "要",
1196
+ "就",
1197
+ "出",
1198
+ "会",
1199
+ "可",
1200
+ "也",
1201
+ "你",
1202
+ "对",
1203
+ "生",
1204
+ "能",
1205
+ "而",
1206
+ "子",
1207
+ "那",
1208
+ "得",
1209
+ "于",
1210
+ "着",
1211
+ "下",
1212
+ "自",
1213
+ "之",
1214
+ "年",
1215
+ "过",
1216
+ "发",
1217
+ "后",
1218
+ "作",
1219
+ "里",
1220
+ "用",
1221
+ "道",
1222
+ "行",
1223
+ "所",
1224
+ "然",
1225
+ "家",
1226
+ "种",
1227
+ "事",
1228
+ "成",
1229
+ "方",
1230
+ "多",
1231
+ "经",
1232
+ "么",
1233
+ "去",
1234
+ "法",
1235
+ "学",
1236
+ "如",
1237
+ "都",
1238
+ "同",
1239
+ "现",
1240
+ "当",
1241
+ "没",
1242
+ "动",
1243
+ "面",
1244
+ "起",
1245
+ "看",
1246
+ "定",
1247
+ "天",
1248
+ "分",
1249
+ "还",
1250
+ "进",
1251
+ "好",
1252
+ "小",
1253
+ "部",
1254
+ "其",
1255
+ "些",
1256
+ "主",
1257
+ "样",
1258
+ "理",
1259
+ "心",
1260
+ "她",
1261
+ "本",
1262
+ "前",
1263
+ "开",
1264
+ "但",
1265
+ "因",
1266
+ "只",
1267
+ "从",
1268
+ "想",
1269
+ "实",
1270
+ ],
1271
+ "Ukrainian": [
1272
+ "о",
1273
+ "а",
1274
+ "н",
1275
+ "і",
1276
+ "и",
1277
+ "р",
1278
+ "в",
1279
+ "т",
1280
+ "е",
1281
+ "с",
1282
+ "к",
1283
+ "л",
1284
+ "у",
1285
+ "д",
1286
+ "м",
1287
+ "п",
1288
+ "з",
1289
+ "я",
1290
+ "ь",
1291
+ "б",
1292
+ "г",
1293
+ "й",
1294
+ "ч",
1295
+ "х",
1296
+ "ц",
1297
+ "ї",
1298
+ ],
1299
+ "Norwegian": [
1300
+ "e",
1301
+ "r",
1302
+ "n",
1303
+ "t",
1304
+ "a",
1305
+ "s",
1306
+ "i",
1307
+ "o",
1308
+ "l",
1309
+ "d",
1310
+ "g",
1311
+ "k",
1312
+ "m",
1313
+ "v",
1314
+ "f",
1315
+ "p",
1316
+ "u",
1317
+ "b",
1318
+ "h",
1319
+ "å",
1320
+ "y",
1321
+ "j",
1322
+ "ø",
1323
+ "c",
1324
+ "æ",
1325
+ "w",
1326
+ ],
1327
+ "Finnish": [
1328
+ "a",
1329
+ "i",
1330
+ "n",
1331
+ "t",
1332
+ "e",
1333
+ "s",
1334
+ "l",
1335
+ "o",
1336
+ "u",
1337
+ "k",
1338
+ "ä",
1339
+ "m",
1340
+ "r",
1341
+ "v",
1342
+ "j",
1343
+ "h",
1344
+ "p",
1345
+ "y",
1346
+ "d",
1347
+ "ö",
1348
+ "g",
1349
+ "c",
1350
+ "b",
1351
+ "f",
1352
+ "w",
1353
+ "z",
1354
+ ],
1355
+ "Vietnamese": [
1356
+ "n",
1357
+ "h",
1358
+ "t",
1359
+ "i",
1360
+ "c",
1361
+ "g",
1362
+ "a",
1363
+ "o",
1364
+ "u",
1365
+ "m",
1366
+ "l",
1367
+ "r",
1368
+ "à",
1369
+ "đ",
1370
+ "s",
1371
+ "e",
1372
+ "v",
1373
+ "p",
1374
+ "b",
1375
+ "y",
1376
+ "ư",
1377
+ "d",
1378
+ "á",
1379
+ "k",
1380
+ "ộ",
1381
+ "ế",
1382
+ ],
1383
+ "Czech": [
1384
+ "o",
1385
+ "e",
1386
+ "a",
1387
+ "n",
1388
+ "t",
1389
+ "s",
1390
+ "i",
1391
+ "l",
1392
+ "v",
1393
+ "r",
1394
+ "k",
1395
+ "d",
1396
+ "u",
1397
+ "m",
1398
+ "p",
1399
+ "í",
1400
+ "c",
1401
+ "h",
1402
+ "z",
1403
+ "á",
1404
+ "y",
1405
+ "j",
1406
+ "b",
1407
+ "ě",
1408
+ "é",
1409
+ "ř",
1410
+ ],
1411
+ "Hungarian": [
1412
+ "e",
1413
+ "a",
1414
+ "t",
1415
+ "l",
1416
+ "s",
1417
+ "n",
1418
+ "k",
1419
+ "r",
1420
+ "i",
1421
+ "o",
1422
+ "z",
1423
+ "á",
1424
+ "é",
1425
+ "g",
1426
+ "m",
1427
+ "b",
1428
+ "y",
1429
+ "v",
1430
+ "d",
1431
+ "h",
1432
+ "u",
1433
+ "p",
1434
+ "j",
1435
+ "ö",
1436
+ "f",
1437
+ "c",
1438
+ ],
1439
+ "Korean": [
1440
+ "이",
1441
+ "다",
1442
+ "에",
1443
+ "의",
1444
+ "는",
1445
+ "로",
1446
+ "하",
1447
+ "을",
1448
+ "가",
1449
+ "고",
1450
+ "지",
1451
+ "서",
1452
+ "한",
1453
+ "은",
1454
+ "기",
1455
+ "으",
1456
+ "년",
1457
+ "대",
1458
+ "사",
1459
+ "시",
1460
+ "를",
1461
+ "리",
1462
+ "도",
1463
+ "인",
1464
+ "스",
1465
+ "일",
1466
+ ],
1467
+ "Indonesian": [
1468
+ "a",
1469
+ "n",
1470
+ "e",
1471
+ "i",
1472
+ "r",
1473
+ "t",
1474
+ "u",
1475
+ "s",
1476
+ "d",
1477
+ "k",
1478
+ "m",
1479
+ "l",
1480
+ "g",
1481
+ "p",
1482
+ "b",
1483
+ "o",
1484
+ "h",
1485
+ "y",
1486
+ "j",
1487
+ "c",
1488
+ "w",
1489
+ "f",
1490
+ "v",
1491
+ "z",
1492
+ "x",
1493
+ "q",
1494
+ ],
1495
+ "Turkish": [
1496
+ "a",
1497
+ "e",
1498
+ "i",
1499
+ "n",
1500
+ "r",
1501
+ "l",
1502
+ "ı",
1503
+ "k",
1504
+ "d",
1505
+ "t",
1506
+ "s",
1507
+ "m",
1508
+ "y",
1509
+ "u",
1510
+ "o",
1511
+ "b",
1512
+ "ü",
1513
+ "ş",
1514
+ "v",
1515
+ "g",
1516
+ "z",
1517
+ "h",
1518
+ "c",
1519
+ "p",
1520
+ "ç",
1521
+ "ğ",
1522
+ ],
1523
+ "Romanian": [
1524
+ "e",
1525
+ "i",
1526
+ "a",
1527
+ "r",
1528
+ "n",
1529
+ "t",
1530
+ "u",
1531
+ "l",
1532
+ "o",
1533
+ "c",
1534
+ "s",
1535
+ "d",
1536
+ "p",
1537
+ "m",
1538
+ "ă",
1539
+ "f",
1540
+ "v",
1541
+ "î",
1542
+ "g",
1543
+ "b",
1544
+ "ș",
1545
+ "ț",
1546
+ "z",
1547
+ "h",
1548
+ "â",
1549
+ "j",
1550
+ ],
1551
+ "Farsi": [
1552
+ "ا",
1553
+ "ی",
1554
+ "ر",
1555
+ "د",
1556
+ "ن",
1557
+ "ه",
1558
+ "و",
1559
+ "م",
1560
+ "ت",
1561
+ "ب",
1562
+ "س",
1563
+ "ل",
1564
+ "ک",
1565
+ "ش",
1566
+ "ز",
1567
+ "ف",
1568
+ "گ",
1569
+ "ع",
1570
+ "خ",
1571
+ "ق",
1572
+ "ج",
1573
+ "آ",
1574
+ "پ",
1575
+ "ح",
1576
+ "ط",
1577
+ "ص",
1578
+ ],
1579
+ "Arabic": [
1580
+ "ا",
1581
+ "ل",
1582
+ "ي",
1583
+ "م",
1584
+ "و",
1585
+ "ن",
1586
+ "ر",
1587
+ "ت",
1588
+ "ب",
1589
+ "ة",
1590
+ "ع",
1591
+ "د",
1592
+ "س",
1593
+ "ف",
1594
+ "ه",
1595
+ "ك",
1596
+ "ق",
1597
+ "أ",
1598
+ "ح",
1599
+ "ج",
1600
+ "ش",
1601
+ "ط",
1602
+ "ص",
1603
+ "ى",
1604
+ "خ",
1605
+ "إ",
1606
+ ],
1607
+ "Danish": [
1608
+ "e",
1609
+ "r",
1610
+ "n",
1611
+ "t",
1612
+ "a",
1613
+ "i",
1614
+ "s",
1615
+ "d",
1616
+ "l",
1617
+ "o",
1618
+ "g",
1619
+ "m",
1620
+ "k",
1621
+ "f",
1622
+ "v",
1623
+ "u",
1624
+ "b",
1625
+ "h",
1626
+ "p",
1627
+ "å",
1628
+ "y",
1629
+ "ø",
1630
+ "æ",
1631
+ "c",
1632
+ "j",
1633
+ "w",
1634
+ ],
1635
+ "Serbian": [
1636
+ "а",
1637
+ "и",
1638
+ "о",
1639
+ "е",
1640
+ "н",
1641
+ "р",
1642
+ "с",
1643
+ "у",
1644
+ "т",
1645
+ "к",
1646
+ "ј",
1647
+ "в",
1648
+ "д",
1649
+ "м",
1650
+ "п",
1651
+ "л",
1652
+ "г",
1653
+ "з",
1654
+ "б",
1655
+ "a",
1656
+ "i",
1657
+ "e",
1658
+ "o",
1659
+ "n",
1660
+ "ц",
1661
+ "ш",
1662
+ ],
1663
+ "Lithuanian": [
1664
+ "i",
1665
+ "a",
1666
+ "s",
1667
+ "o",
1668
+ "r",
1669
+ "e",
1670
+ "t",
1671
+ "n",
1672
+ "u",
1673
+ "k",
1674
+ "m",
1675
+ "l",
1676
+ "p",
1677
+ "v",
1678
+ "d",
1679
+ "j",
1680
+ "g",
1681
+ "ė",
1682
+ "b",
1683
+ "y",
1684
+ "ų",
1685
+ "š",
1686
+ "ž",
1687
+ "c",
1688
+ "ą",
1689
+ "į",
1690
+ ],
1691
+ "Slovene": [
1692
+ "e",
1693
+ "a",
1694
+ "i",
1695
+ "o",
1696
+ "n",
1697
+ "r",
1698
+ "s",
1699
+ "l",
1700
+ "t",
1701
+ "j",
1702
+ "v",
1703
+ "k",
1704
+ "d",
1705
+ "p",
1706
+ "m",
1707
+ "u",
1708
+ "z",
1709
+ "b",
1710
+ "g",
1711
+ "h",
1712
+ "č",
1713
+ "c",
1714
+ "š",
1715
+ "ž",
1716
+ "f",
1717
+ "y",
1718
+ ],
1719
+ "Slovak": [
1720
+ "o",
1721
+ "a",
1722
+ "e",
1723
+ "n",
1724
+ "i",
1725
+ "r",
1726
+ "v",
1727
+ "t",
1728
+ "s",
1729
+ "l",
1730
+ "k",
1731
+ "d",
1732
+ "m",
1733
+ "p",
1734
+ "u",
1735
+ "c",
1736
+ "h",
1737
+ "j",
1738
+ "b",
1739
+ "z",
1740
+ "á",
1741
+ "y",
1742
+ "ý",
1743
+ "í",
1744
+ "č",
1745
+ "é",
1746
+ ],
1747
+ "Hebrew": [
1748
+ "י",
1749
+ "ו",
1750
+ "ה",
1751
+ "ל",
1752
+ "ר",
1753
+ "ב",
1754
+ "ת",
1755
+ "מ",
1756
+ "א",
1757
+ "ש",
1758
+ "נ",
1759
+ "ע",
1760
+ "ם",
1761
+ "ד",
1762
+ "ק",
1763
+ "ח",
1764
+ "פ",
1765
+ "ס",
1766
+ "כ",
1767
+ "ג",
1768
+ "ט",
1769
+ "צ",
1770
+ "ן",
1771
+ "ז",
1772
+ "ך",
1773
+ ],
1774
+ "Bulgarian": [
1775
+ "а",
1776
+ "и",
1777
+ "о",
1778
+ "е",
1779
+ "н",
1780
+ "т",
1781
+ "р",
1782
+ "с",
1783
+ "в",
1784
+ "л",
1785
+ "к",
1786
+ "д",
1787
+ "п",
1788
+ "м",
1789
+ "з",
1790
+ "г",
1791
+ "я",
1792
+ "ъ",
1793
+ "у",
1794
+ "б",
1795
+ "ч",
1796
+ "ц",
1797
+ "й",
1798
+ "ж",
1799
+ "щ",
1800
+ "х",
1801
+ ],
1802
+ "Croatian": [
1803
+ "a",
1804
+ "i",
1805
+ "o",
1806
+ "e",
1807
+ "n",
1808
+ "r",
1809
+ "j",
1810
+ "s",
1811
+ "t",
1812
+ "u",
1813
+ "k",
1814
+ "l",
1815
+ "v",
1816
+ "d",
1817
+ "m",
1818
+ "p",
1819
+ "g",
1820
+ "z",
1821
+ "b",
1822
+ "c",
1823
+ "č",
1824
+ "h",
1825
+ "š",
1826
+ "ž",
1827
+ "ć",
1828
+ "f",
1829
+ ],
1830
+ "Hindi": [
1831
+ "क",
1832
+ "र",
1833
+ "स",
1834
+ "न",
1835
+ "त",
1836
+ "म",
1837
+ "ह",
1838
+ "प",
1839
+ "य",
1840
+ "ल",
1841
+ "व",
1842
+ "ज",
1843
+ "द",
1844
+ "ग",
1845
+ "ब",
1846
+ "श",
1847
+ "ट",
1848
+ "अ",
1849
+ "ए",
1850
+ "थ",
1851
+ "भ",
1852
+ "ड",
1853
+ "च",
1854
+ "ध",
1855
+ "ष",
1856
+ "इ",
1857
+ ],
1858
+ "Estonian": [
1859
+ "a",
1860
+ "i",
1861
+ "e",
1862
+ "s",
1863
+ "t",
1864
+ "l",
1865
+ "u",
1866
+ "n",
1867
+ "o",
1868
+ "k",
1869
+ "r",
1870
+ "d",
1871
+ "m",
1872
+ "v",
1873
+ "g",
1874
+ "p",
1875
+ "j",
1876
+ "h",
1877
+ "ä",
1878
+ "b",
1879
+ "õ",
1880
+ "ü",
1881
+ "f",
1882
+ "c",
1883
+ "ö",
1884
+ "y",
1885
+ ],
1886
+ "Thai": [
1887
+ "า",
1888
+ "น",
1889
+ "ร",
1890
+ "อ",
1891
+ "ก",
1892
+ "เ",
1893
+ "ง",
1894
+ "ม",
1895
+ "ย",
1896
+ "ล",
1897
+ "ว",
1898
+ "ด",
1899
+ "ท",
1900
+ "ส",
1901
+ "ต",
1902
+ "ะ",
1903
+ "ป",
1904
+ "บ",
1905
+ "ค",
1906
+ "ห",
1907
+ "แ",
1908
+ "จ",
1909
+ "พ",
1910
+ "ช",
1911
+ "ข",
1912
+ "ใ",
1913
+ ],
1914
+ "Greek": [
1915
+ "α",
1916
+ "τ",
1917
+ "ο",
1918
+ "ι",
1919
+ "ε",
1920
+ "ν",
1921
+ "ρ",
1922
+ "σ",
1923
+ "κ",
1924
+ "η",
1925
+ "π",
1926
+ "ς",
1927
+ "υ",
1928
+ "μ",
1929
+ "λ",
1930
+ "ί",
1931
+ "ό",
1932
+ "ά",
1933
+ "γ",
1934
+ "έ",
1935
+ "δ",
1936
+ "ή",
1937
+ "ω",
1938
+ "χ",
1939
+ "θ",
1940
+ "ύ",
1941
+ ],
1942
+ "Tamil": [
1943
+ "க",
1944
+ "த",
1945
+ "ப",
1946
+ "ட",
1947
+ "ர",
1948
+ "ம",
1949
+ "ல",
1950
+ "ன",
1951
+ "வ",
1952
+ "ற",
1953
+ "ய",
1954
+ "ள",
1955
+ "ச",
1956
+ "ந",
1957
+ "இ",
1958
+ "ண",
1959
+ "அ",
1960
+ "ஆ",
1961
+ "ழ",
1962
+ "ங",
1963
+ "எ",
1964
+ "உ",
1965
+ "ஒ",
1966
+ "ஸ",
1967
+ ],
1968
+ "Kazakh": [
1969
+ "а",
1970
+ "ы",
1971
+ "е",
1972
+ "н",
1973
+ "т",
1974
+ "р",
1975
+ "л",
1976
+ "і",
1977
+ "д",
1978
+ "с",
1979
+ "м",
1980
+ "қ",
1981
+ "к",
1982
+ "о",
1983
+ "б",
1984
+ "и",
1985
+ "у",
1986
+ "ғ",
1987
+ "ж",
1988
+ "ң",
1989
+ "з",
1990
+ "ш",
1991
+ "й",
1992
+ "п",
1993
+ "г",
1994
+ "ө",
1995
+ ],
1996
+ }
1997
+
1998
+ LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
.venv/lib/python3.13/site-packages/charset_normalizer/legacy.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+ from warnings import warn
5
+
6
+ from .api import from_bytes
7
+ from .constant import CHARDET_CORRESPONDENCE
8
+
9
# TODO: remove this check when dropping Python 3.7 support
# The typed dictionary below only exists for static type checkers; nothing in
# this branch is executed at runtime.
if TYPE_CHECKING:
    from typing_extensions import TypedDict

    class ResultDict(TypedDict):
        # Shape of the mapping returned by detect().
        encoding: str | None
        language: str
        confidence: float | None
17
+
18
+
19
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
    """
    chardet legacy method
    Guess the encoding of a byte string and report it the way chardet does.
    Meant to be mostly backward-compatible: the encoding name follows Chardet
    own writing whenever a correspondence is known.
    This function is deprecated and exists to ease migration, consult the
    documentation for further information. Not planned for removal.

    :param byte_str: The byte sequence to examine (bytes or bytearray).
    :param should_rename_legacy: The chardet correspondence table is applied
        to the encoding name only while this flag is exactly False.
    :param kwargs: Any extra keyword argument is ignored; a warning naming
        them is issued.
    :return: A dict with the keys "encoding", "language" and "confidence".
        "encoding" and "confidence" are None when no match was found,
        "language" is then an empty string.
    :raises TypeError: When byte_str is neither bytes nor bytearray.
    """
    if kwargs:
        ignored_names = ",".join(kwargs)
        warn(
            f"charset-normalizer disregard arguments '{ignored_names}' in legacy function detect()"
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    payload = bytes(byte_str) if isinstance(byte_str, bytearray) else byte_str
    best_match = from_bytes(payload).best()

    if best_match is None:
        encoding = None
        language = ""
        confidence = None
    else:
        encoding = best_match.encoding
        # An "Unknown" language is reported as an empty string.
        language = "" if best_match.language == "Unknown" else best_match.language
        # Confidence is the complement of the measured chaos.
        confidence = 1.0 - best_match.chaos

        # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get
        # stripped in the detection/normalization process, but chardet does
        # return 'utf-8-sig' and it is a valid codec name.
        if encoding == "utf_8" and best_match.bom:
            encoding += "_sig"

    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
.venv/lib/python3.13/site-packages/charset_normalizer/md.cpython-313-darwin.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:994bd264cafab72dffc8b5faf1867904942bf754477ec7e18890964021413271
3
+ size 115664
.venv/lib/python3.13/site-packages/charset_normalizer/md.py ADDED
@@ -0,0 +1,630 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from functools import lru_cache
4
+ from logging import getLogger
5
+
6
+ from .constant import (
7
+ COMMON_SAFE_ASCII_CHARACTERS,
8
+ TRACE,
9
+ UNICODE_SECONDARY_RANGE_KEYWORD,
10
+ )
11
+ from .utils import (
12
+ is_accentuated,
13
+ is_arabic,
14
+ is_arabic_isolated_form,
15
+ is_case_variable,
16
+ is_cjk,
17
+ is_emoticon,
18
+ is_hangul,
19
+ is_hiragana,
20
+ is_katakana,
21
+ is_latin,
22
+ is_punctuation,
23
+ is_separator,
24
+ is_symbol,
25
+ is_thai,
26
+ is_unprintable,
27
+ remove_accent,
28
+ unicode_range,
29
+ )
30
+
31
+
32
class MessDetectorPlugin:
    """
    Common interface shared by the mess detection plugins.
    The base class holds no logic: every member below raises
    NotImplementedError until a concrete detector overrides it.
    """

    def eligible(self, character: str) -> bool:
        """
        Tell whether the given character is relevant to this plugin,
        that is, whether it should be handed over to feed().
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        Consume one character and update the internal state.
        This is where a detector decides what makes a text look chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Bring the plugin back to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Chaos ratio derived from what feed() has observed so far.
        Must NOT be lower than 0.; there is no upper bound.
        """
        raise NotImplementedError  # pragma: nocover
64
+
65
+
66
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Flag content in which punctuation marks and symbols represent a large
    share (30% or more, symbols counting double) of the printable characters.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Character received by the previous feed() call; an immediate
        # repetition of it is not counted a second time.
        self._last_printable_char: str | None = None
        # Not read anywhere in this class; kept so the instance attributes
        # stay the same for any external reader.
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # A symbol weighs twice as much as a punctuation mark.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Also forget the previous character: a stale value would make the
        # first character fed after a reset be treated as a repetition.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        # Anything under the 0.3 threshold is reported as no mess at all.
        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
111
+
112
+
113
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Flag content in which accentuated letters represent 35% or more of the
    alphabetic characters, once at least 8 of them have been seen.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1
        self._accentuated_count += 1 if is_accentuated(character) else 0

    def reset(self) -> None:
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        seen = self._character_count
        # Too few letters to draw any conclusion.
        if seen < 8:
            return 0.0

        accentuated_share: float = self._accentuated_count / seen
        if accentuated_share < 0.35:
            return 0.0
        return accentuated_share
138
+
139
+
140
class UnprintablePlugin(MessDetectorPlugin):
    """
    Penalise content that carries unprintable characters; each one weighs
    eight times a regular character in the resulting ratio.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        # Every character is inspected.
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:
        self._unprintable_count = 0
        # The denominator must be cleared as well, otherwise the ratio computed
        # after a reset is diluted by the characters seen before it.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
162
+
163
+
164
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Look for two accentuated latin letters following each other, with extra
    weight when both are upper case or share the same base letter.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1

        previous = self._last_latin_character
        both_accentuated = (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        )

        if both_accentuated:
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Even worse when the same letter is repeated with another accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character

    def reset(self) -> None:
        self._last_latin_character = None
        self._character_count = 0
        self._successive_count = 0

    @property
    def ratio(self) -> float:
        seen = self._character_count
        if seen == 0:
            return 0.0

        return (self._successive_count * 2) / seen
199
+
200
+
201
class SuspiciousRange(MessDetectorPlugin):
    """
    Count pairs of adjacent characters whose unicode ranges are judged
    suspicious by is_suspiciously_successive_range().
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        breaks_the_pair = (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        )
        if breaks_the_pair:
            # Spaces, punctuation and safe ASCII never take part in a pair.
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        if previous is not None:
            previous_range: str | None = unicode_range(previous)
            current_range: str | None = unicode_range(character)

            if is_suspiciously_successive_range(previous_range, current_range):
                self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:
        self._last_printable_seen = None
        self._suspicious_successive_range_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        seen = self._character_count
        # 13 characters or fewer are not enough to judge.
        if seen <= 13:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / seen
248
+
249
+
250
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Accumulate alphabetic characters into words and flag the words that look
    unnatural: mostly accentuated, ending with an upper case accentuated
    letter, holding a single glyph among other letters, very long and
    foreign looking, or mixed with symbols.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # State of the word currently being accumulated.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            if (
                is_cjk(character)
                or is_hangul(character)
                or is_katakana(character)
                or is_hiragana(character)
                or is_thai(character)
            ):
                self._buffer_glyph_count += 1
            return

        # Nothing buffered: a non-alphabetic character has nothing to close.
        if not self._buffer:
            return

        if character.isspace() or is_punctuation(character) or is_separator(character):
            # End of word: judge the buffered content.
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length >= 0.5:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                elif (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                elif self._buffer_glyph_count == 1:
                    self._is_current_word_bad = True
                    self._foreign_long_count += 1

            if buffer_length >= 24 and self._foreign_long_watch:
                uppercase_count = sum(1 for c in self._buffer if c.isupper())
                # A long word with a few (at most 30%) upper case letters is
                # probably camelCased rather than garbage.
                probable_camel_cased: bool = (
                    uppercase_count > 0 and uppercase_count / buffer_length <= 0.3
                )

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
            self._buffer_glyph_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # A symbol inside a word marks the whole word as bad.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:
        self._buffer = ""
        # The per-word counters belong to the buffer and must be cleared with
        # it, otherwise they leak into the first word fed after a reset.
        self._buffer_accent_count = 0
        self._buffer_glyph_count = 0
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        # With 10 words or fewer, only a foreign/long hit is worth reporting.
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
366
+
367
+
368
class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB(Chinese) based encoding often render the stop incorrectly when the
    content does not fit, which is easy to spot: this plugin measures the
    overuse of '丅' and '丄' relative to the other CJK characters.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character == "丅" or character == "丄":
            # A wrong stop is never counted as a regular CJK character.
            self._wrong_stop_count += 1
        elif is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:
        self._cjk_character_count = 0
        self._wrong_stop_count = 0

    @property
    def ratio(self) -> float:
        cjk_seen = self._cjk_character_count
        # Fewer than 16 CJK characters is not enough to judge.
        if cjk_seen < 16:
            return 0.0
        return self._wrong_stop_count / cjk_seen
397
+
398
+
399
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Tally rapid upper/lower case alternation inside runs of characters.

    A character that is not a case-variable letter closes the current run when
    that run is non-empty; otherwise it is processed like any other character.
    The alternations found in a run are added to the final tally only when the
    run is at most 64 characters long, contained a non-ASCII character and was
    not closed by a digit.
    """

    def __init__(self) -> None:
        # Totals for the whole stream.
        self._character_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        # State of the run currently being read.
        self._character_count_since_last_sep: int = 0
        self._successive_upper_lower_count: int = 0
        self._last_alpha_seen: str | None = None
        self._current_ascii_only: bool = True

        # True while a single case flip is pending; a second consecutive
        # flip is what actually scores.
        self._buf: bool = False

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_separator = (
            character.isalpha() and is_case_variable(character)
        ) is False

        if is_separator and self._character_count_since_last_sep > 0:
            run_is_scored = (
                self._character_count_since_last_sep <= 64
                and not character.isdigit()
                and not self._current_ascii_only
            )
            if run_is_scored:
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # The separator itself is counted, then a fresh run begins.
            self._character_count += 1
            self._character_count_since_last_sep = 0
            self._successive_upper_lower_count = 0
            self._last_alpha_seen = None
            self._current_ascii_only = True
            self._buf = False
            return

        if not character.isascii():
            self._current_ascii_only = False

        previous = self._last_alpha_seen
        if previous is not None:
            case_flipped = (character.isupper() and previous.islower()) or (
                character.islower() and previous.isupper()
            )
            if case_flipped and self._buf:
                self._successive_upper_lower_count += 2
            # A flip toggles the pending marker; anything else clears it.
            self._buf = case_flipped and not self._buf

        self._last_alpha_seen = character
        self._character_count_since_last_sep += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        self._character_count = self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        """Scored alternations per character fed, or 0.0 when nothing was fed."""
        total = self._character_count
        return 0.0 if total == 0 else self._successive_upper_lower_count_final / total
473
+
474
+
475
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """
    Measure how many of the Arabic characters fed are isolated forms.

    Only characters accepted by is_arabic() are fed; the ratio stays at 0.0
    until at least 8 of them have been counted.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def eligible(self, character: str) -> bool:
        return is_arabic(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        self._isolated_form_count += 1 if is_arabic_isolated_form(character) else 0

    def reset(self) -> None:  # Abstract
        self._character_count = self._isolated_form_count = 0

    @property
    def ratio(self) -> float:
        """Isolated forms per counted character, or 0.0 below 8 counted characters."""
        counted = self._character_count
        return 0.0 if counted < 8 else self._isolated_form_count / counted
501
+
502
+
503
+ @lru_cache(maxsize=1024)
504
+ def is_suspiciously_successive_range(
505
+ unicode_range_a: str | None, unicode_range_b: str | None
506
+ ) -> bool:
507
+ """
508
+ Determine if two Unicode range seen next to each other can be considered as suspicious.
509
+ """
510
+ if unicode_range_a is None or unicode_range_b is None:
511
+ return True
512
+
513
+ if unicode_range_a == unicode_range_b:
514
+ return False
515
+
516
+ if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
517
+ return False
518
+
519
+ if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
520
+ return False
521
+
522
+ # Latin characters can be accompanied with a combining diacritical mark
523
+ # eg. Vietnamese.
524
+ if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
525
+ "Combining" in unicode_range_a or "Combining" in unicode_range_b
526
+ ):
527
+ return False
528
+
529
+ keywords_range_a, keywords_range_b = (
530
+ unicode_range_a.split(" "),
531
+ unicode_range_b.split(" "),
532
+ )
533
+
534
+ for el in keywords_range_a:
535
+ if el in UNICODE_SECONDARY_RANGE_KEYWORD:
536
+ continue
537
+ if el in keywords_range_b:
538
+ return False
539
+
540
+ # Japanese Exception
541
+ range_a_jp_chars, range_b_jp_chars = (
542
+ unicode_range_a
543
+ in (
544
+ "Hiragana",
545
+ "Katakana",
546
+ ),
547
+ unicode_range_b in ("Hiragana", "Katakana"),
548
+ )
549
+ if (range_a_jp_chars or range_b_jp_chars) and (
550
+ "CJK" in unicode_range_a or "CJK" in unicode_range_b
551
+ ):
552
+ return False
553
+ if range_a_jp_chars and range_b_jp_chars:
554
+ return False
555
+
556
+ if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
557
+ if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
558
+ return False
559
+ if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
560
+ return False
561
+
562
+ # Chinese/Japanese use dedicated range for punctuation and/or separators.
563
+ if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
564
+ unicode_range_a in ["Katakana", "Hiragana"]
565
+ and unicode_range_b in ["Katakana", "Hiragana"]
566
+ ):
567
+ if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
568
+ return False
569
+ if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
570
+ return False
571
+ if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
572
+ return False
573
+
574
+ return True
575
+
576
+
577
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute the mess ratio of an already decoded string.

    One instance of every MessDetectorPlugin subclass is fed each character
    of the sequence followed by a single trailing newline. At regular
    checkpoints, and on the last character, the detector ratios are summed;
    the scan stops early once that sum reaches maximum_threshold. The last
    computed sum is returned rounded to three decimals. With debug enabled,
    details are logged at TRACE level on the "charset_normalizer" logger.
    """
    detectors: list[MessDetectorPlugin] = [
        plugin_class() for plugin_class in MessDetectorPlugin.__subclasses__()
    ]

    # +1 accounts for the newline appended to the sequence below.
    length: int = len(decoded_sequence) + 1
    final_index: int = length - 1

    # Longer inputs are evaluated at wider intervals.
    intermediary_mean_mess_ratio_calc: int = (
        32 if length < 512 else 64 if length <= 1024 else 128
    )

    mean_mess_ratio: float = 0.0

    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        at_checkpoint = index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        if not (at_checkpoint or index == final_index):
            continue

        mean_mess_ratio = sum(plugin.ratio for plugin in detectors)

        if mean_mess_ratio >= maximum_threshold:
            break

    if not debug:
        return round(mean_mess_ratio, 3)

    logger = getLogger("charset_normalizer")

    logger.log(
        TRACE,
        "Mess-detector extended-analysis start. "
        f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
        f"maximum_threshold={maximum_threshold}",
    )

    if len(decoded_sequence) > 16:
        logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
        logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

    for dt in detectors:
        logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)