From 2b7cbd3d8b7735f8aa2c35ed1b920d13eb74761a Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Thu, 25 Jun 2026 13:08:21 -0700 Subject: [PATCH 1/2] merge Signed-off-by: Mayank Mishra --- .github/copyright.yaml | 8 ++++++++ .pre-commit-config.yaml | 13 +++++++++++++ Makefile | 1 - lm_engine/data/megatron/__init__.py | 4 ++++ lm_engine/data/megatron/bin.py | 4 ++++ lm_engine/data/megatron/blended_dataset.py | 4 ++++ .../megatron/blended_megatron_dataset_builder.py | 4 ++++ .../megatron/blended_megatron_dataset_config.py | 4 ++++ lm_engine/data/megatron/dtype.py | 2 +- lm_engine/data/megatron/gpt_dataset.py | 4 ++++ lm_engine/data/megatron/indexed_dataset.py | 4 ++++ lm_engine/data/megatron/merge_data.py | 4 ++++ lm_engine/data/megatron/preprocess_data.py | 4 ++++ lm_engine/data/megatron/sampler.py | 4 ++++ lm_engine/data/megatron/utils/__init__.py | 4 ++++ lm_engine/data/megatron/utils/helpers.cpp | 4 ++++ 16 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 .github/copyright.yaml diff --git a/.github/copyright.yaml b/.github/copyright.yaml new file mode 100644 index 000000000..71ae68bbc --- /dev/null +++ b/.github/copyright.yaml @@ -0,0 +1,8 @@ +# ************************************************** +# Copyright (c) 2026, Mayank Mishra +# ************************************************** + +one_date_re: '\bCopyright \(c\) (?P<year>[0-9]{4}), Mayank Mishra\b' +two_date_re: '\bCopyright \(c\) (?P<from>[0-9]{4})-(?P<to>[0-9]{4}), Mayank Mishra\b' +one_date_format: 'Copyright (c) {year}, Mayank Mishra' +two_date_format: 'Copyright (c) {from}-{to}, Mayank Mishra' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a5959b515..78ebac258 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,6 +3,19 @@ # ************************************************** repos: + - repo: local + hooks: + - id: add-copyright + name: add copyright header + language: python + entry: python tools/copyright.py --repo . 
--header "Copyright (c) 2026, Mayank Mishra" --no-contributors + types: [python] + pass_filenames: false + - repo: https://github.com/sbrunner/hooks + rev: 1.8.0 + hooks: + - id: copyright-required + types: [python] - repo: https://github.com/PyCQA/autoflake rev: v2.3.1 hooks: diff --git a/Makefile b/Makefile index 5657b90b6..844fa200c 100644 --- a/Makefile +++ b/Makefile @@ -20,5 +20,4 @@ update-precommit: uv run --extra dev --no-default-groups pre-commit autoupdate style: - uv run --extra dev --no-default-groups python tools/copyright.py --repo ./ --exclude copyright-exclude.txt --header "Copyright (c) $$(date +%Y), __authors__" --extra-name "Mayank Mishra" --no-contributors uv run --extra dev --no-default-groups pre-commit run --all-files diff --git a/lm_engine/data/megatron/__init__.py b/lm_engine/data/megatron/__init__.py index d7805b7b9..633e1d1f8 100644 --- a/lm_engine/data/megatron/__init__.py +++ b/lm_engine/data/megatron/__init__.py @@ -1,3 +1,7 @@ +# ************************************************** +# Copyright (c) 2026, Mayank Mishra +# ************************************************** + from .blended_megatron_dataset_builder import build from .blended_megatron_dataset_config import GPTDatasetConfig from .gpt_dataset import GPTDataset diff --git a/lm_engine/data/megatron/bin.py b/lm_engine/data/megatron/bin.py index 17b2c879b..ad1046cdf 100644 --- a/lm_engine/data/megatron/bin.py +++ b/lm_engine/data/megatron/bin.py @@ -1,3 +1,7 @@ +# ************************************************** +# Copyright (c) 2026, Mayank Mishra +# ************************************************** + # Copyright (c) Facebook, Inc. and its affiliates. 
# # This source code is licensed under the MIT license found in the diff --git a/lm_engine/data/megatron/blended_dataset.py b/lm_engine/data/megatron/blended_dataset.py index 476437cb7..0cddd84a6 100644 --- a/lm_engine/data/megatron/blended_dataset.py +++ b/lm_engine/data/megatron/blended_dataset.py @@ -1,3 +1,7 @@ +# ************************************************** +# Copyright (c) 2026, Mayank Mishra +# ************************************************** + # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. from __future__ import annotations diff --git a/lm_engine/data/megatron/blended_megatron_dataset_builder.py b/lm_engine/data/megatron/blended_megatron_dataset_builder.py index 3feeb40ea..a1c196a2e 100644 --- a/lm_engine/data/megatron/blended_megatron_dataset_builder.py +++ b/lm_engine/data/megatron/blended_megatron_dataset_builder.py @@ -1,3 +1,7 @@ +# ************************************************** +# Copyright (c) 2026, Mayank Mishra +# ************************************************** + # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. from __future__ import annotations diff --git a/lm_engine/data/megatron/blended_megatron_dataset_config.py b/lm_engine/data/megatron/blended_megatron_dataset_config.py index a749fd60f..65ea1af2f 100644 --- a/lm_engine/data/megatron/blended_megatron_dataset_config.py +++ b/lm_engine/data/megatron/blended_megatron_dataset_config.py @@ -1,3 +1,7 @@ +# ************************************************** +# Copyright (c) 2026, Mayank Mishra +# ************************************************** + # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
import logging diff --git a/lm_engine/data/megatron/dtype.py b/lm_engine/data/megatron/dtype.py index 0f6f7cded..17e6c32da 100644 --- a/lm_engine/data/megatron/dtype.py +++ b/lm_engine/data/megatron/dtype.py @@ -1,5 +1,5 @@ # ************************************************** -# Copyright (c) 2025, Mayank Mishra +# Copyright (c) 2026, Mayank Mishra # ************************************************** from __future__ import annotations diff --git a/lm_engine/data/megatron/gpt_dataset.py b/lm_engine/data/megatron/gpt_dataset.py index 32256876d..8c1db12d0 100644 --- a/lm_engine/data/megatron/gpt_dataset.py +++ b/lm_engine/data/megatron/gpt_dataset.py @@ -1,3 +1,7 @@ +# ************************************************** +# Copyright (c) 2026, Mayank Mishra +# ************************************************** + # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. from __future__ import annotations diff --git a/lm_engine/data/megatron/indexed_dataset.py b/lm_engine/data/megatron/indexed_dataset.py index 6c60ac6d0..12cfe7090 100644 --- a/lm_engine/data/megatron/indexed_dataset.py +++ b/lm_engine/data/megatron/indexed_dataset.py @@ -1,3 +1,7 @@ +# ************************************************** +# Copyright (c) 2026, Mayank Mishra +# ************************************************** + # Copyright (c) Facebook, Inc. and its affiliates. 
# # This source code is licensed under the MIT license found in the diff --git a/lm_engine/data/megatron/merge_data.py b/lm_engine/data/megatron/merge_data.py index b0b82171f..39c557844 100644 --- a/lm_engine/data/megatron/merge_data.py +++ b/lm_engine/data/megatron/merge_data.py @@ -1,3 +1,7 @@ +# ************************************************** +# Copyright (c) 2026, Mayank Mishra +# ************************************************** + from .indexed_dataset import MMapIndexedDataset, MMapIndexedDatasetBuilder, get_bin_path, get_idx_path diff --git a/lm_engine/data/megatron/preprocess_data.py b/lm_engine/data/megatron/preprocess_data.py index 154c4d765..0de6a04de 100644 --- a/lm_engine/data/megatron/preprocess_data.py +++ b/lm_engine/data/megatron/preprocess_data.py @@ -1,3 +1,7 @@ +# ************************************************** +# Copyright (c) 2026, Mayank Mishra +# ************************************************** + # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. from __future__ import annotations diff --git a/lm_engine/data/megatron/sampler.py b/lm_engine/data/megatron/sampler.py index 55fa4627b..e7f24ea31 100644 --- a/lm_engine/data/megatron/sampler.py +++ b/lm_engine/data/megatron/sampler.py @@ -1,3 +1,7 @@ +# ************************************************** +# Copyright (c) 2026, Mayank Mishra +# ************************************************** + from __future__ import annotations diff --git a/lm_engine/data/megatron/utils/__init__.py b/lm_engine/data/megatron/utils/__init__.py index 957b180ba..91e1daf01 100644 --- a/lm_engine/data/megatron/utils/__init__.py +++ b/lm_engine/data/megatron/utils/__init__.py @@ -1,3 +1,7 @@ +# ************************************************** +# Copyright (c) 2026, Mayank Mishra +# ************************************************** + # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
import logging diff --git a/lm_engine/data/megatron/utils/helpers.cpp b/lm_engine/data/megatron/utils/helpers.cpp index 62a63fee0..e8e1330de 100644 --- a/lm_engine/data/megatron/utils/helpers.cpp +++ b/lm_engine/data/megatron/utils/helpers.cpp @@ -1,3 +1,7 @@ +// ************************************************** +// Copyright (c) 2026, Mayank Mishra +// ************************************************** + /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ /* Helper methods for fast index mapping builds */ From 971c082b77e0087ac689dfbe4be8ca1bd3a36c76 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Thu, 25 Jun 2026 13:15:52 -0700 Subject: [PATCH 2/2] merge Signed-off-by: Mayank Mishra --- .pre-commit-config.yaml | 9 +++++---- tools/copyright.py | 25 ++++++++++++++++++++----- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 78ebac258..f35d9ad61 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,11 +11,12 @@ repos: entry: python tools/copyright.py --repo . --header "Copyright (c) 2026, Mayank Mishra" --no-contributors types: [python] pass_filenames: false - - repo: https://github.com/sbrunner/hooks - rev: 1.8.0 - hooks: - - id: copyright-required + - id: check-copyright + name: check copyright year + language: python + entry: python tools/copyright.py --repo . 
--header "Copyright (c) 2026, Mayank Mishra" --no-contributors --check types: [python] + pass_filenames: false - repo: https://github.com/PyCQA/autoflake rev: v2.3.1 hooks: diff --git a/tools/copyright.py b/tools/copyright.py index ea0a33228..9d543c49f 100644 --- a/tools/copyright.py +++ b/tools/copyright.py @@ -14,6 +14,7 @@ parser.add_argument("--header", type=str, required=True) parser.add_argument("--extra-name", type=str, required=False) parser.add_argument("--no-contributors", action="store_true", required=False) +parser.add_argument("--check", action="store_true", required=False) args = parser.parse_args() @@ -121,11 +122,14 @@ def _build_html_header(file: str) -> str: ) -def _check_and_add_copyright_header(file: str, build_header_fn, pattern: re.Pattern) -> None: +def _check_and_add_copyright_header(file: str, build_header_fn, pattern: re.Pattern) -> bool: code = open(file, "r").read() if len(code) == 0: - return + return True + + if args.check: + return bool(pattern.match(code)) header = build_header_fn(file) code_stripped = pattern.sub("", code) @@ -135,6 +139,7 @@ def _check_and_add_copyright_header(file: str, build_header_fn, pattern: re.Patt code = f"{header}{code}" open(file, "w").writelines([code]) + return True def _is_banned(path: str) -> bool: @@ -150,6 +155,7 @@ def _is_banned(path: str) -> bool: directory = os.path.realpath(args.repo) _AUTHOR_MAP = {} if args.no_contributors else _build_author_map(directory) +missing = [] for root, dirs, files in os.walk(directory): if _is_banned(root): continue @@ -160,9 +166,18 @@ def _is_banned(path: str) -> bool: if _is_banned(file): continue + ok = True if any([file.endswith(i) for i in _CPP_LIKE_EXTENSIONS]): - _check_and_add_copyright_header(file, _build_cpp_header, _CPP_PATTERN) + ok = _check_and_add_copyright_header(file, _build_cpp_header, _CPP_PATTERN) elif any([file.endswith(i) for i in _PYTHON_LIKE_EXTENSIONS]): - _check_and_add_copyright_header(file, _build_python_header, _PYTHON_PATTERN) + ok = 
_check_and_add_copyright_header(file, _build_python_header, _PYTHON_PATTERN) elif any([file.endswith(i) for i in _HTML_LIKE_EXTENSIONS]): - _check_and_add_copyright_header(file, _build_html_header, _HTML_PATTERN) + ok = _check_and_add_copyright_header(file, _build_html_header, _HTML_PATTERN) + + if not ok: + missing.append(os.path.relpath(file, directory)) + +if missing: + for f in sorted(missing): + print(f"No copyright found on '{f}'.") + raise SystemExit(1)