[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "gte-qwen2-7b-instruct-m2v"
version = "0.1.0"
description = "Model2Vec distillation pipeline for gte-Qwen2-7B-instruct"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "accelerate>=1.7.0",
    "beam-client>=0.2.155",
    "boto3>=1.38.23",
    "datasets>=3.6.0",
    "dotenv>=0.9.9",
    "editables>=0.5",
    "einops>=0.8.1",
    "flash-attn>=2.7.4.post1",
    "hatchling>=1.27.0",
    "iso639>=0.1.4",
    "jinja2>=3.0.0",
    "joblib>=1.0.0",
    # Exact pin to a release candidate.
    "kaleido==1.0.0rc13",
    "lightning>=2.5.1.post0",
    "matplotlib>=3.10.3",
    "more-itertools>=10.5.0",
    "mteb>=1.14.15",
    "numpy>=1.26.4",
    "plotly>=6.1.1",
    "psutil>=7.0.0",
    "pydantic>=2.11.5",
    "requests>=2.32.3",
    "rich>=10.0.0",
    "safetensors>=0.3.0",
    "scikit-learn>=1.6.1",
    "seaborn>=0.13.2",
    "sentence-transformers>=4.1.0",
    "setuptools>=80.8.0",
    "skops>=0.11.0",
    "smart-open[s3]>=7.1.0",
    "statsmodels>=0.14.4",
    "tokenizers>=0.20",
    "torch>=2.7.0",
    "tqdm>=4.65.0",
    # Upper bound only: versions newer than 4.52.1 are excluded.
    "transformers<=4.52.1",
    "typer>=0.16.0",
]

[project.scripts]
distiller = "distiller.__main__:app"

[dependency-groups]
dev = [
    "mypy>=1.15.0",
    "ruff>=0.11.6",
]

[tool.hatch.build.targets.wheel]
packages = ["src/distiller"]

[tool.mypy]
exclude = [
    ".git",
    ".ruff_cache",
    ".venv",
    "venv",
    "__pycache__",
    "build",
    "dist",
    "vendor",
]
follow_untyped_imports = true

[tool.ruff]
line-length = 120
target-version = "py312"
# Exclude files/directories
exclude = [
    ".git",
    ".ruff_cache",
    ".venv",
    "venv",
    "__pycache__",
    "build",
    "dist",
    "vendor",
    "src/distiller/model2vec",
    "src/distiller/tokenlearn",
]

[tool.ruff.lint]
# Enable all rules by default, then selectively disable
select = ["ALL"]
ignore = [
    # Rules that conflict with other tools/preferences
    "D203",    # one-blank-line-before-class
    "D212",    # multi-line-summary-first-line
    "FBT001",  # Boolean positional arg in function definition (required for typer)
    "FBT002",  # Boolean default value in function definition (required for typer)
    "C901",    # function too complex
    "PLR0911", # too many return statements
    "PLR0912", # too many branches
    "PLR0913", # too many arguments in function definition
    "PLR0915", # too many statements
    "TRY300",  # Consider moving this statement to an `else` block
    "COM812",  # Trailing comma missing (conflicts with the formatter)
    "TC001",   # Move application import into a type-checking block
    "ERA001",  # Found commented-out code
    "G004",    # Logging statement uses f-string
    "TD003",   # Missing link in to-do
    "TRY301",  # Abstract raise to an inner function

    # Disable rules that conflict with tab indentation
    # (see `indent-style = "tab"` under [tool.ruff.format])
    "E101",    # Indentation contains mixed spaces and tabs
    "W191",    # indentation contains tabs
    "D206",    # indent with spaces, not tabs

    # Project preferences
    "PD901",   # Avoid using the generic variable name `df` for DataFrames
    "ANN401",  # Dynamically typed expressions (typing.Any) are disallowed
    "D103",    # Missing docstring in public function
    "BLE001",  # Do not catch blind exception: `Exception`
    "T201",    # Use `logger.info` instead of `print`
    "E501",    # Line too long
    "PLR2004", # Magic value used in comparison
    "RUF001",  # String contains ambiguous unicode character
    "D100",    # Missing docstring in public module
    "D101",    # Missing docstring in public class
]

# NOTE(review): C901 is in the ignore list above, so this threshold is
# currently not enforced; it takes effect only if C901 is re-enabled.
[tool.ruff.lint.mccabe]
max-complexity = 10

# NOTE(review): PLR0913, PLR0912 and PLR0915 are in the ignore list above,
# so these thresholds are currently not enforced.
[tool.ruff.lint.pylint]
max-args = 5
max-branches = 12
max-statements = 50

[tool.ruff.lint.pydocstyle]
convention = "google"

[tool.ruff.format]
quote-style = "double"
indent-style = "tab"
skip-magic-trailing-comma = false
line-ending = "auto"