[build-system]
# hatchling is both the build backend and the only build-time requirement.
build-backend = "hatchling.build"
requires = ["hatchling"]

[project]
# Package metadata.
name = "gte-qwen2-7b-instruct-m2v"
version = "0.1.0"
description = "Model2Vec distillation pipeline for gte-Qwen2-7B-instruct"
readme = "README.md"
requires-python = ">=3.12"

# Runtime requirements: one PEP 508 specifier per line, sorted by name.
dependencies = [
    "accelerate>=1.7.0",
    "beam-client>=0.2.155",
    "boto3>=1.38.23",
    "datasets>=3.6.0",
    "dotenv>=0.9.9",
    # NOTE(review): editables, hatchling and setuptools are packaging tools;
    # confirm they really need to be installed as runtime dependencies.
    "editables>=0.5",
    "einops>=0.8.1",
    "flash-attn>=2.7.4.post1",
    "hatchling>=1.27.0",
    "iso639>=0.1.4",
    "jinja2>=3.0.0",
    "joblib>=1.0.0",
    # Pinned exactly to a release candidate.
    "kaleido==1.0.0rc13",
    "lightning>=2.5.1.post0",
    "matplotlib>=3.10.3",
    "more-itertools>=10.5.0",
    "mteb>=1.14.15",
    "numpy>=1.26.4",
    "plotly>=6.1.1",
    "psutil>=7.0.0",
    "pydantic>=2.11.5",
    "requests>=2.32.3",
    "rich>=10.0.0",
    "safetensors>=0.3.0",
    "scikit-learn>=1.6.1",
    "seaborn>=0.13.2",
    "sentence-transformers>=4.1.0",
    "setuptools>=80.8.0",
    "skops>=0.11.0",
    "smart-open[s3]>=7.1.0",
    "statsmodels>=0.14.4",
    "tokenizers>=0.20",
    "torch>=2.7.0",
    "tqdm>=4.65.0",
    # Upper bound only; no minimum version is declared here.
    "transformers<=4.52.1",
    "typer>=0.16.0",
]

[project.scripts]
# Console entry point: the `distiller` command resolves to the `app` object
# in the `distiller.__main__` module.
distiller = "distiller.__main__:app"

[dependency-groups]
# Development-only tooling: static type checking (mypy) and linting/formatting (ruff).
dev = [
    "mypy>=1.15.0",
    "ruff>=0.11.6",
]

[tool.hatch.build.targets.wheel]
# The wheel ships the package located under src/distiller.
packages = ["src/distiller"]

[tool.mypy]
# Each `exclude` entry is a regular expression that mypy searches for anywhere
# in a (forward-slash) path while discovering files. Bare names are therefore
# substring matches: "dist" also matches "src/distiller/...", "build" matches
# any "build_*.py" module, and the "." in ".git" matches any character.
# Every pattern below is anchored to a whole directory component so that only
# the intended directories are skipped. Literal (single-quoted) strings are
# used so the escaped dot does not need double escaping.
exclude = [
    '(^|/)\.git/',
    '(^|/)\.ruff_cache/',
    '(^|/)\.venv/',
    '(^|/)venv/',
    '(^|/)__pycache__/',
    '(^|/)build/',
    '(^|/)dist/',
    '(^|/)vendor/',
]
follow_untyped_imports = true

[tool.ruff]
target-version = "py312"
line-length = 120

# Paths Ruff skips entirely.
exclude = [
    # Version-control metadata, caches and virtual environments
    ".git",
    ".ruff_cache",
    "__pycache__",
    ".venv",
    "venv",
    # Build artefacts
    "build",
    "dist",
    # Third-party / vendored trees
    "vendor",
    # Sub-packages of src/distiller that are left unlinted
    "src/distiller/model2vec",
    "src/distiller/tokenlearn",
]

[tool.ruff.lint]
# Opt in to every rule, then switch off the ones listed in `ignore`.
select = ["ALL"]
ignore = [
    # Docstring rules
    "D100",    # Missing docstring in public module
    "D101",    # Missing docstring in public class
    "D103",    # Missing docstring in public function
    "D203",    # one-blank-line-before-class
    "D212",    # multi-line-summary-first-line

    # Rules that conflict with tab indentation
    "D206",    # Docstring should be indented with spaces, not tabs
    "E101",    # Indentation contains mixed spaces and tabs
    "W191",    # Indentation contains tabs

    # Boolean parameters (required for typer)
    "FBT001",  # Boolean positional arg in function definition
    "FBT002",  # Boolean default value in function definition

    # Complexity and size limits
    "C901",    # Function too complex
    "PLR0911", # Too many return statements
    "PLR0912", # Too many branches
    "PLR0913", # Too many arguments in function definition
    "PLR0915", # Too many statements
    "PLR2004", # Magic value used in comparison

    # Exception-handling style
    "BLE001",  # Do not catch blind exception: `Exception`
    "TRY300",  # Consider moving this statement to an `else` block
    "TRY301",  # Abstract `raise` to an inner function

    # Miscellaneous
    "ANN401",  # Dynamically typed expressions (typing.Any) are disallowed
    "COM812",  # Missing trailing comma
    "E501",    # Line too long
    "ERA001",  # Found commented-out code
    "G004",    # Logging statement uses f-string
    "PD901",   # Avoid using the generic variable name `df` for DataFrames
    "RUF001",  # String contains ambiguous Unicode character
    "T201",    # `print` found
    "TC001",   # Move application import into a type-checking block
    "TD003",   # Missing issue link for to-do
]

[tool.ruff.lint.mccabe]
# Threshold used by rule C901 (function too complex).
# NOTE(review): C901 is in the `ignore` list of [tool.ruff.lint], so this
# limit looks inert unless that rule is re-enabled - confirm intent.
max-complexity = 10

[tool.ruff.lint.pylint]
# Thresholds for the pylint-derived size rules.
# NOTE(review): PLR0912, PLR0913 and PLR0915 are in the `ignore` list of
# [tool.ruff.lint], so these limits look inert unless those rules are
# re-enabled - confirm intent.
max-args = 5         # PLR0913: too many arguments
max-branches = 12    # PLR0912: too many branches
max-statements = 50  # PLR0915: too many statements

[tool.ruff.lint.pydocstyle]
# Docstrings are checked against the Google style convention.
convention = "google"

[tool.ruff.format]
# Indent with tabs; the tab-related lint rules (E101, W191, D206) are ignored to match.
indent-style = "tab"
quote-style = "double"
line-ending = "auto"
# Keep honouring magic trailing commas.
skip-magic-trailing-comma = false