KAFKA-14995: Automate asf.yaml collaborators refresh (#17124)

Add a Python script that analyzes our Git history to find top contributors. This can be used by committers to update
the list of collaborators in .asf.yaml without a lot of tedious effort.

Co-authored-by: stevenbooke <steviebeee55@gmail.com>
Co-authored-by: Joao Pedro Fonseca <fonsdant@gmail.com>
Reviewers: David Arthur <mumrah@gmail.com>
This commit is contained in:
João Pedro Fonseca Dantas 2024-09-10 22:29:51 -03:00 committed by GitHub
parent 6fd973b4a5
commit 794e9a4a52
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 233 additions and 0 deletions

70
committer-tools/README.md Normal file
View File

@ -0,0 +1,70 @@
# Refresh Collaborators Script
The Refresh Collaborators script automates the process of fetching contributor
data from GitHub repositories, filtering top contributors who are not part of
the existing committers, and updating a local configuration file (.asf.yaml) to
include these new contributors.
## Table of Contents
- [Requirements](#requirements)
- [Installation](#installation)
- [Usage](#usage)
## Requirements
- Python 3.x and pip
- A valid GitHub token with repository read access
## Installation
### 1. Check Python installation
Check if Python and pip are installed on your system.
```bash
python3 --version
pip3 --version
```
### 2. Set up a virtual environment (optional)
```bash
python3 -m venv venv
# For Linux/macOS
source venv/bin/activate
# On Windows:
# .\venv\Scripts\activate
```
### 3. Install the required dependencies
```bash
pip3 install -r requirements.txt
```
## Usage
### 1. Set up the environment variable for GitHub Token
You need to set up a valid GitHub token to access the repository. After you
generate it (or authenticate via GitHub CLI), this can be done by setting the
GITHUB_TOKEN environment variable.
```bash
# For Linux/macOS
export GITHUB_TOKEN="your_github_token"
# Or if you use GitHub CLI
export GITHUB_TOKEN="$(gh auth token)"
# On Windows (PowerShell):
# $env:GITHUB_TOKEN = "your_github_token"
```
### 2. Run the script
```bash
python3 refresh_collaborators.py
```

View File

@ -0,0 +1,143 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
This script automates the process of fetching contributor data from GitHub
repositories, filtering top contributors who are not part of the existing
committers, and updating a local configuration file (.asf.yaml) to include these
new contributors.
"""
import io
import logging
import os
from datetime import datetime, timedelta
from typing import Dict, List, Tuple
from bs4 import BeautifulSoup
from github import Github
from github.Commit import Commit
from github.ContentFile import ContentFile
from github.PaginatedList import PaginatedList
from github.Repository import Repository
from ruamel.yaml import YAML
# Emit timestamped, level-tagged log lines at INFO and above.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

# Token read from the environment. An empty string stands for "not set", so
# the value is always a str and truthiness checks on it keep working.
GITHUB_TOKEN: str = os.getenv("GITHUB_TOKEN", "")
# Repository whose committers.html is read for the committer list.
REPO_KAFKA_SITE: str = "apache/kafka-site"
# Repository whose commit history and contributors are analysed.
REPO_KAFKA: str = "apache/kafka"
# .asf.yaml is expected one directory above this script. Anchor the path to
# the script's own location so the tool does not depend on the directory it
# is launched from.
ASF_YAML_PATH: str = os.path.normpath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, ".asf.yaml")
)
# Number of non-committer contributors to keep.
TOP_N_CONTRIBUTORS: int = 10
def get_github_client() -> Github:
    """
    Build a GitHub client authenticated with the GITHUB_TOKEN setting.

    Returns:
        A Github client that authenticates with the configured token.

    Raises:
        ValueError: if GITHUB_TOKEN is empty or unset.
    """
    # Imported here so this function stays self-contained and does not rely
    # on changes to the module's import block.
    from github import Auth

    if not GITHUB_TOKEN:
        logging.error("GITHUB_TOKEN is not set in the environment")
        raise ValueError("GITHUB_TOKEN is not set in the environment")

    # PyGithub 2.x deprecates passing the token as a positional argument;
    # wrap it in an explicit Auth.Token instead.
    client = Github(auth=Auth.Token(GITHUB_TOKEN))
    # Report success only once the client object actually exists.
    logging.info("Successfully initialized GitHub client")
    return client
def get_committers_list(repo: Repository) -> List[str]:
    """
    Read committers.html from the given repository and return the text of
    every <div class="github_login"> element found in it.
    """
    logging.info(f"Fetching committers from the repository {REPO_KAFKA_SITE}")
    page: ContentFile = repo.get_contents("committers.html")
    document: BeautifulSoup = BeautifulSoup(page.decoded_content, "html.parser")

    logins: List[str] = []
    for node in document.find_all("div", class_="github_login"):
        logins.append(node.text)

    logging.info(f"Found {len(logins)} committers")
    return logins
def get_top_contributors(repo: Repository, committers: List[str]) -> List[str]:
    """
    Rank the repository's non-committer contributors by the number of
    commits they authored in the last 365 days.

    Args:
        repo: repository whose commits and contributors are inspected.
        committers: GitHub logins to leave out of the ranking.

    Returns:
        At most TOP_N_CONTRIBUTORS logins, most commits first. Contributors
        with equal counts (including zero) keep the order in which
        repo.get_contributors() yields them.
    """
    logging.info(f"Fetching contributors from the repository {REPO_KAFKA}")
    one_year_ago: datetime = datetime.now() - timedelta(days=365)

    # Tally the window's commits per author login in one pass, instead of
    # rescanning the full commit list once for every contributor.
    commits_by_login: Dict[str, int] = {}
    for commit in repo.get_commits(since=one_year_ago):
        author = commit.author
        # A commit may carry no author object (presumably when GitHub has no
        # account linked to it); such commits match no contributor.
        if author is None:
            continue
        commits_by_login[author.login] = commits_by_login.get(author.login, 0) + 1

    # Set membership keeps the committer check O(1) per contributor.
    excluded = set(committers)
    contributors: Dict[str, int] = {}
    for contributor in repo.get_contributors():
        if contributor.login not in excluded:
            contributors[contributor.login] = commits_by_login.get(
                contributor.login, 0
            )

    # sorted() is stable, so ties keep the dict's insertion order.
    sorted_contributors: List[Tuple[str, int]] = sorted(
        contributors.items(), key=lambda x: x[1], reverse=True
    )
    top_contributors = [login for login, _ in sorted_contributors][:TOP_N_CONTRIBUTORS]
    logging.info(
        f"Found {len(top_contributors)} top contributors who are not committers"
    )
    return top_contributors
def update_local_yaml_content(yaml_file_path: str, collaborators: List[str]) -> None:
    """
    Rewrite the YAML file at yaml_file_path so that both
    jenkins.github_whitelist and github.collaborators hold the given logins.
    """
    logging.info(
        f"Updating {yaml_file_path} with {len(collaborators)} new collaborators"
    )

    parser: YAML = YAML()
    with open(yaml_file_path, "r", encoding="utf-8") as source:
        document: dict = parser.load(source)

    # The second key receives a copy rather than the same list object --
    # presumably so the dumper does not write it as an alias of the first.
    document["jenkins"]["github_whitelist"] = collaborators
    document["github"]["collaborators"] = collaborators.copy()

    with open(yaml_file_path, "w", encoding="utf-8") as target:
        parser.dump(document, target)

    logging.info(f"Local file {yaml_file_path} updated successfully")
def main() -> None:
    """
    Fetch the committers, rank the non-committer contributors and write the
    result into the local .asf.yaml file.
    """
    client: Github = get_github_client()

    # The committer list comes from the site repository.
    committers: List[str] = get_committers_list(client.get_repo(REPO_KAFKA_SITE))

    # The contributors are ranked against the main repository.
    top_contributors: List[str] = get_top_contributors(
        client.get_repo(REPO_KAFKA), committers
    )

    update_local_yaml_content(ASF_YAML_PATH, top_contributors)
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Keep the traceback in the log and report the failure through a
        # non-zero exit status, so callers can tell the refresh did not run.
        logging.exception(f"Error: {e}")
        raise SystemExit(1) from e

View File

@ -0,0 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
beautifulsoup4==4.12.3
PyGithub==2.4.0
ruamel.yaml==0.18.6