# pyproject.toml

# Build backend declaration: the package is built with setuptools.
[build-system]
build-backend = "setuptools.build_meta"
# Packages that must be installed in the build environment.
requires = ["setuptools >= 61.0"]

# Core package metadata.
[project]
name = "llm-datasets"
# Also referenced by `version_toml` in [tool.semantic_release].
version = "0.0.3"
description = "A collection of datasets for language model training including scripts for downloading, preprocessing, and sampling."
readme = "README.md"
requires-python = ">=3.9"
license = { text = "Apache 2.0" }
authors = [{ name = "Malte Ostendorff", email = "malte.ostendorff@dfki.de" }]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3 :: Only",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
]
# Runtime requirements installed with the base package.
# `wget` and `zstandard` are listed again, with version constraints,
# in the `datasets` extra.
dependencies = [
    "requests",
    "pandas",
    "datasets",
    "smart-open",
    "wget",
    "zstandard",
    "polars",
    "pyarrow>=14.0.1",
    "dill",
]

# Console entry points installed with the package.
[project.scripts]
# The `llm-datasets` command resolves to `main` in `llm_datasets.__main__`.
llm-datasets = "llm_datasets.__main__:main"

# Optional dependency groups ("extras"), installable as `llm-datasets[<name>]`.
[project.optional-dependencies]

# Libraries with version constraints used by the dataset scripts.
datasets = [
    "wget==3.2",
    "mwparserfromhell>=0.6.4",
    "gensim>=4.3.0",
    "zstandard>=0.20.0",
    "treelib>=1.6.4",
    "conllu>=4.5.3",
    "prevert>=1.0.2",
    "translate-toolkit>=3.9.1",
    "folia>=2.5.8",
    "estnltk>=1.7.1",
    "Jinja2",
]

# Linting, testing, packaging and notebook tooling.
dev = [
    "pre-commit",
    "black",
    "flake8",
    "flake8-pyproject",
    "pytest",
    "pytest-cov",
    "pytest-runner",
    "pytest-xdist",
    "coverage",
    "pylint",
    "twine",
    "jupyter",
    "ruff>=0.0.254",
]

# Documentation site tooling.
# NOTE(review): "mkdocstrings" is also covered by "mkdocstrings[python]".
docs = [
    "mkdocs",
    "mkdocs-material",
    "mkdocs-exclude",
    "mkdocstrings",
    "mkdocstrings[python]",
]

viewer = [
    "streamlit",
    "ngrok",
]

datatrove = [
    "datatrove[all]>=0.2.0",
]

# Aggregate extra: pulls in every group above by referencing this package's own extras.
all = [
    "llm-datasets[datasets]",
    "llm-datasets[dev]",
    "llm-datasets[docs]",
    "llm-datasets[viewer]",
    "llm-datasets[datatrove]",
]

# Project links published with the package metadata; all point at the GitHub repository.
[project.urls]
Documentation = "https://github.com/malteos/llm-datasets/tree/main#readme"
Source = "https://github.com/malteos/llm-datasets"
Tracker = "https://github.com/malteos/llm-datasets/issues"

# coverage.py: measure branch coverage in addition to statement coverage.
[tool.coverage.run]
branch = true

# coverage.py: the report fails when total coverage is below this percentage.
[tool.coverage.report]
fail_under = 100

# flake8 settings. flake8 does not read pyproject.toml natively; the
# `flake8-pyproject` plugin listed in the `dev` extra makes this table effective.
[tool.flake8]
max-line-length = 120
select = "F,E,W,B,B901,B902,B903"
exclude = [
    ".eggs",
    ".git",
    ".tox",
    ".github",
    ".vscode",
    "nssm",
    "obj",
    "out",
    "packages",
    "pywin32",
    "tests",
    "swagger_client",
    "dist",
    "images",
    "docs",
]
ignore = ["E722", "B001", "W503", "E203"]

# Pyright settings. These keys previously sat under [tool.flake8], where
# Pyright does not look for them; they need their own table to take effect.
[tool.pyright]
reportMissingImports = true
reportMissingTypeStubs = false

# NOTE(review): [project] declares requires-python = ">=3.9"; confirm that
# type-checking against 3.10 is intended.
pythonVersion = "3.10"
pythonPlatform = "Linux"

executionEnvironments = [{ root = "src" }]

# pytest configuration.
[tool.pytest.ini_options]
# Directory searched for tests when pytest is run without explicit paths.
testpaths = "tests"
# Added to the import search path for test runs; presumably so the code
# under `src` is importable without installing the package.
pythonpath = ["src"]


# Ruff: settings shared by all Ruff commands.
[tool.ruff]
# Target the lowest interpreter the package claims to support
# ([project] requires-python = ">=3.9"), so that lint results stay valid on 3.9.
target-version = "py39"
# Same limit as flake8's max-line-length.
line-length = 120

# Ruff lint rule selection.
[tool.ruff.lint]
# Enabled rule families: Pyflakes (F), isort (I), pycodestyle errors (E), pydocstyle (D).
select = [
    "F",
    "I",
    "E",
    "D",
]
ignore = [
    # pycodestyle / Pyflakes
    "E501", # line too long
    "E741", # ambiguous variable name
    "F403", # `from module import *` used; unable to detect undefined names
    # pydocstyle: missing-docstring checks
    "D100", # Missing docstring in public module
    "D101", # Missing docstring in public class
    "D102", # Missing docstring in public method
    "D103", # Missing docstring in public function
    "D104", # Missing docstring in public package
    "D105", # Missing docstring in magic method
    "D107", # Missing docstring in __init__
    # pydocstyle: docstring formatting checks
    "D205", # 1 blank line required between summary line and description
    "D415", # First line should end with a period, question mark, or exclamation point
]

# Check docstrings (the "D" rules) against the Google convention.
[tool.ruff.lint.pydocstyle]
convention = "google"

# Options for Ruff's flake8-annotations (ANN) rules.
# NOTE(review): `select` in [tool.ruff.lint] does not list "ANN", so these
# options presumably have no effect unless ANN is enabled elsewhere; confirm.
[tool.ruff.lint.flake8-annotations]
suppress-none-returning = true
mypy-init-return = true

# python-semantic-release: automated versioning and release builds.
[tool.semantic_release]
# Releases are cut from this branch.
branch = "main"
# Tags consist of the bare version string, with no prefix.
tag_format = "{version}"
# Location of the version to bump: the `version` key of [project] in this file.
version_toml = ["pyproject.toml:project.version"]
# Installs the `build` frontend, then builds the distributions.
build_command = "python -m pip install build; python -m build"

# Commit types grouped by the version component they are meant to bump.
# NOTE(review): verify that these key names are recognized by the commit
# parser in use; unrecognized options may be ignored.
[tool.semantic_release.commit_parser_options]
patch_types = ["fix", "perf"]
minor_types = ["feat"]
major_types = ["breaking"]