diff --git a/committer-tools/README.md b/committer-tools/README.md
new file mode 100644
index 00000000000..3d012aa5393
--- /dev/null
+++ b/committer-tools/README.md
@@ -0,0 +1,70 @@
+# Refresh Collaborators Script
+
+The Refresh Collaborators script automates the process of fetching contributor
+data from GitHub repositories, filtering top contributors who are not part of
+the existing committers, and updating a local configuration file (.asf.yaml) to
+include these new contributors.
+
+## Table of Contents
+
+- [Requirements](#requirements)
+- [Installation](#installation)
+- [Usage](#usage)
+
+## Requirements
+
+- Python 3.x and pip
+- A valid GitHub token with repository read access
+
+## Installation
+
+### 1. Check Python installation
+
+Check if Python and pip are installed on your system.
+
+```bash
+python3 --version
+pip3 --version
+```
+
+### 2. Set up a virtual environment (optional)
+
+```bash
+python3 -m venv venv
+
+# For Linux/macOS
+source venv/bin/activate
+
+# On Windows:
+# .\venv\Scripts\activate
+```
+
+### 3. Install the required dependencies
+
+```bash
+pip3 install -r requirements.txt
+```
+
+## Usage
+
+### 1. Set up the environment variable for GitHub Token
+
+You need to set up a valid GitHub token to access the repository. After you
+generate it (or authenticate via GitHub CLI), this can be done by setting the
+GITHUB_TOKEN environment variable.
+
+```bash
+# For Linux/macOS
+export GITHUB_TOKEN="your_github_token"
+# Or if you use GitHub CLI
+export GITHUB_TOKEN="$(gh auth token)"
+
+# On Windows (PowerShell):
+# $env:GITHUB_TOKEN = "your_github_token"
+```
+
+### 2. Run the script
+
+```bash
+python3 refresh_collaborators.py
+```
diff --git a/committer-tools/refresh_collaborators.py b/committer-tools/refresh_collaborators.py
new file mode 100644
index 00000000000..61131275b8e
--- /dev/null
+++ b/committer-tools/refresh_collaborators.py
@@ -0,0 +1,143 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.
+# See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+This script automates the process of fetching contributor data from GitHub
+repositories, filtering top contributors who are not part of the existing
+committers, and updating a local configuration file (.asf.yaml) to include these
+new contributors.
+"""
+
+import io
+import logging
+import os
+import sys
+from collections import Counter
+from datetime import datetime, timedelta, timezone
+from typing import Dict, List, Optional, Tuple
+
+from bs4 import BeautifulSoup
+from github import Auth, Github
+from github.Commit import Commit
+from github.ContentFile import ContentFile
+from github.PaginatedList import PaginatedList
+from github.Repository import Repository
+from ruamel.yaml import YAML
+
+logging.basicConfig(
+    format="%(asctime)s %(levelname)s %(message)s",
+    level=logging.INFO,
+)
+
+# os.getenv returns None when the variable is unset; get_github_client checks it.
+GITHUB_TOKEN: Optional[str] = os.getenv("GITHUB_TOKEN")
+REPO_KAFKA_SITE: str = "apache/kafka-site"
+REPO_KAFKA: str = "apache/kafka"
+# Resolved against this script's directory (not the current working directory)
+# so the tool updates the same .asf.yaml no matter where it is launched from.
+ASF_YAML_PATH: str = os.path.normpath(
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", ".asf.yaml")
+)
+TOP_N_CONTRIBUTORS: int = 10
+
+
+def get_github_client() -> Github:
+    """
+    Initialize GitHub client with token.
+
+    Returns:
+        A Github client authenticated with GITHUB_TOKEN.
+
+    Raises:
+        ValueError: if GITHUB_TOKEN is not set (or empty) in the environment.
+    """
+    if not GITHUB_TOKEN:
+        # The caller logs the failure; logging here as well would report it twice.
+        raise ValueError("GITHUB_TOKEN is not set in the environment")
+
+    # Passing the token positionally is deprecated in PyGithub 2.x; use Auth.
+    client: Github = Github(auth=Auth.Token(GITHUB_TOKEN))
+    logging.info("Successfully initialized GitHub client")
+    return client
+
+
+def get_committers_list(repo: Repository) -> List[str]:
+    """
+    Fetch the committers from the given repository.
+
+    Reads committers.html from the repository and returns the text of every
+    <div class="github_login"> element, with surrounding whitespace removed and
+    empty entries dropped.
+    """
+    logging.info(f"Fetching committers from the repository {REPO_KAFKA_SITE}")
+    committers_file: ContentFile = repo.get_contents("committers.html")
+    content: bytes = committers_file.decoded_content
+    soup: BeautifulSoup = BeautifulSoup(content, "html.parser")
+
+    # Strip markup whitespace so a login such as "\n  alice\n" still matches the
+    # login reported by the GitHub API.
+    logins = (div.text.strip() for div in soup.find_all("div", class_="github_login"))
+    committers: List[str] = [login for login in logins if login]
+    logging.info(f"Found {len(committers)} committers")
+    return committers
+
+
+def get_top_contributors(repo: Repository, committers: List[str]) -> List[str]:
+    """
+    Get top contributors for the given repository excluding committers.
+
+    Contributors are ranked by the number of commits they authored during the
+    last 365 days. At most TOP_N_CONTRIBUTORS logins are returned, most active
+    first.
+    """
+    logging.info(f"Fetching contributors from the repository {REPO_KAFKA}")
+    # Timezone-aware UTC: PyGithub formats `since` with a literal "Z" suffix, so
+    # a naive local time would be shifted by the local UTC offset.
+    one_year_ago: datetime = datetime.now(timezone.utc) - timedelta(days=365)
+
+    # Count commits per author in one pass instead of rescanning the whole
+    # commit list for every contributor. `author` is None when a commit is not
+    # associated with a GitHub account.
+    last_year_commits: PaginatedList[Commit] = repo.get_commits(since=one_year_ago)
+    commits_by_login: Counter = Counter(
+        commit.author.login
+        for commit in last_year_commits
+        if commit.author is not None
+    )
+
+    # GitHub logins are case-insensitive, so compare them case-folded.
+    known_committers = {login.casefold() for login in committers}
+    contributors: Dict[str, int] = {
+        contributor.login: commits_by_login[contributor.login]
+        for contributor in repo.get_contributors()
+        if contributor.login.casefold() not in known_committers
+    }
+
+    # sorted() is stable, so ties keep the order returned by get_contributors().
+    sorted_contributors: List[Tuple[str, int]] = sorted(
+        contributors.items(), key=lambda x: x[1], reverse=True
+    )
+
+    top_contributors = [login for login, _ in sorted_contributors][:TOP_N_CONTRIBUTORS]
+    logging.info(
+        f"Found {len(top_contributors)} top contributors who are not committers"
+    )
+    return top_contributors
+
+
+def update_local_yaml_content(yaml_file_path: str, collaborators: List[str]) -> None:
+    """
+    Update the local .asf.yaml file with refreshed GitHub whitelist and
+    collaborators.
+
+    The file is loaded, its jenkins.github_whitelist and github.collaborators
+    entries are replaced with the given logins, and it is written back in place.
+
+    Raises:
+        OSError: if the file cannot be read or written.
+        KeyError: if the "jenkins" or "github" section is missing.
+    """
+    logging.info(
+        f"Updating {yaml_file_path} with {len(collaborators)} new collaborators"
+    )
+
+    yaml: YAML = YAML()
+    with open(yaml_file_path, "r", encoding="utf-8") as file:
+        yaml_content: dict = yaml.load(file)
+
+    # Two distinct list objects: presumably a single shared list would be
+    # emitted by the dumper as a YAML anchor/alias pair.
+    yaml_content["jenkins"]["github_whitelist"] = list(collaborators)
+    yaml_content["github"]["collaborators"] = list(collaborators)
+
+    with open(yaml_file_path, "w", encoding="utf-8") as file:
+        yaml.dump(yaml_content, file)
+
+    logging.info(f"Local file {yaml_file_path} updated successfully")
+
+
+def main() -> None:
+    """
+    Refresh the collaborators in .asf.yaml with the top non-committer
+    contributors of apache/kafka.
+    """
+    github_client: Github = get_github_client()
+
+    kafka_site_repo: Repository = github_client.get_repo(REPO_KAFKA_SITE)
+    committers: List[str] = get_committers_list(kafka_site_repo)
+
+    kafka_repo: Repository = github_client.get_repo(REPO_KAFKA)
+    top_contributors: List[str] = get_top_contributors(kafka_repo, committers)
+
+    update_local_yaml_content(ASF_YAML_PATH, top_contributors)
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        # Keep the traceback and exit non-zero so a failed refresh is visible
+        # to the shell or CI job that launched the script.
+        logging.exception(f"Error: {e}")
+        sys.exit(1)
diff --git a/committer-tools/requirements.txt b/committer-tools/requirements.txt
new file mode 100644
index 00000000000..1a000faf8d4
--- /dev/null
+++ b/committer-tools/requirements.txt
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.
See the License for the +# specific language governing permissions and limitations +# under the License. + +beautifulsoup4==4.12.3 +PyGithub==2.4.0 +ruamel.yaml==0.18.6