mirror of https://github.com/apache/kafka.git
KAFKA-14995: Automate asf.yaml collaborators refresh (#17124)
Add a Python script that analyzes our Git history to find top contributors. This can be used by committers to update the list of contributors in .asf.yaml without a lot of tedious effort. Co-authored-by: stevenbooke <steviebeee55@gmail.com> Co-authored-by: Joao Pedro Fonseca <fonsdant@gmail.com> Reviewers: David Arthur <mumrah@gmail.com>
This commit is contained in:
parent
6fd973b4a5
commit
794e9a4a52
|
@ -0,0 +1,70 @@
|
||||||
|
# Refresh Collaborators Script
|
||||||
|
|
||||||
|
The Refresh Collaborators script automates the process of fetching contributor
|
||||||
|
data from GitHub repositories, filtering top contributors who are not part of
|
||||||
|
the existing committers, and updating a local configuration file (.asf.yaml) to
|
||||||
|
include these new contributors.
|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
|
||||||
|
- [Requirements](#requirements)
|
||||||
|
- [Installation](#installation)
|
||||||
|
- [Usage](#usage)
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Python 3.x and pip
|
||||||
|
- A valid GitHub token with repository read access
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
### 1. Check Python installation
|
||||||
|
|
||||||
|
Check if Python and pip are installed in your system.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 --version
|
||||||
|
pip3 --version
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Set up a virtual environment (optional)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 -m venv venv
|
||||||
|
|
||||||
|
# For Linux/macOS
|
||||||
|
source venv/bin/activate
|
||||||
|
|
||||||
|
# On Windows:
|
||||||
|
# .\venv\Scripts\activate
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Install the required dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip3 install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### 1. Set up the environment variable for GitHub Token
|
||||||
|
|
||||||
|
You need to set up a valid GitHub token to access the repository. After you
|
||||||
|
generate it (or authenticate via GitHub CLI), this can be done by setting the
|
||||||
|
GITHUB_TOKEN environment variable.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# For Linux/macOS
|
||||||
|
export GITHUB_TOKEN="your_github_token"
|
||||||
|
# Or if you use GitHub CLI
|
||||||
|
export GITHUB_TOKEN="$(gh auth token)"
|
||||||
|
|
||||||
|
# On Windows:
|
||||||
|
# set GITHUB_TOKEN=your_github_token   (PowerShell: $env:GITHUB_TOKEN="your_github_token")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Run the script
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 refresh_collaborators.py
|
||||||
|
```
|
|
@ -0,0 +1,143 @@
|
||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing,
|
||||||
|
# software distributed under the License is distributed on an
|
||||||
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, either express or implied. See the License for the
|
||||||
|
# specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
This script automates the process of fetching contributor data from GitHub
|
||||||
|
repositories, filtering top contributors who are not part of the existing
|
||||||
|
committers, and updating a local configuration file (.asf.yaml) to include these
|
||||||
|
new contributors.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from github import Github
|
||||||
|
from github.Commit import Commit
|
||||||
|
from github.ContentFile import ContentFile
|
||||||
|
from github.PaginatedList import PaginatedList
|
||||||
|
from github.Repository import Repository
|
||||||
|
from ruamel.yaml import YAML
|
||||||
|
|
||||||
|
# Root logger configuration: timestamp, level and message, at INFO and above.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# Read once at import time; this is None when the variable is unset, which
# get_github_client() checks for before building a client.
GITHUB_TOKEN: str = os.environ.get("GITHUB_TOKEN")

# Repository holding committers.html (the committer list).
REPO_KAFKA_SITE: str = "apache/kafka-site"

# Repository whose contributors and commits are analyzed.
REPO_KAFKA: str = "apache/kafka"

# Path of the configuration file rewritten by update_local_yaml_content().
ASF_YAML_PATH: str = "../.asf.yaml"

# Maximum number of non-committer contributors kept.
TOP_N_CONTRIBUTORS: int = 10
|
||||||
|
|
||||||
|
|
||||||
|
def get_github_client() -> Github:
    """
    Build an authenticated GitHub API client from the GITHUB_TOKEN setting.

    Returns:
        A ``Github`` client authenticated with the configured token.

    Raises:
        ValueError: If GITHUB_TOKEN is empty or was not set in the environment.
    """
    # Function-scope import keeps this block self-contained; ``Auth`` ships
    # with the PyGithub release this script already depends on.
    from github import Auth

    if not GITHUB_TOKEN:
        logging.error("GITHUB_TOKEN is not set in the environment")
        raise ValueError("GITHUB_TOKEN is not set in the environment")

    # Passing the token positionally (login_or_token) is deprecated in
    # PyGithub 2.x; the explicit Auth object is the supported form.
    client = Github(auth=Auth.Token(GITHUB_TOKEN))

    # Report success only after the client object actually exists.
    logging.info("Successfully initialized GitHub client")
    return client
|
||||||
|
|
||||||
|
|
||||||
|
def get_committers_list(repo: Repository) -> List[str]:
    """
    Fetch the GitHub logins of the committers listed in committers.html.

    Args:
        repo: Repository whose root contains the ``committers.html`` page.

    Returns:
        The text of every ``<div class="github_login">`` element, in document
        order, with surrounding whitespace removed and blank entries dropped.
    """
    logging.info(f"Fetching committers from the repository {REPO_KAFKA_SITE}")
    committers_file: ContentFile = repo.get_contents("committers.html")
    content: bytes = committers_file.decoded_content
    soup: BeautifulSoup = BeautifulSoup(content, "html.parser")

    committers: List[str] = []
    for element in soup.find_all("div", class_="github_login"):
        # Callers compare these values to GitHub logins by exact match, so
        # whitespace introduced by HTML formatting must not survive here.
        login = element.text.strip()
        if login:
            committers.append(login)

    logging.info(f"Found {len(committers)} committers")
    return committers
|
||||||
|
|
||||||
|
|
||||||
|
def get_top_contributors(repo: Repository, committers: List[str]) -> List[str]:
    """
    Get top contributors for the given repository excluding committers.

    Contributors are ranked by the number of commits they authored during the
    last 365 days. Contributors with equal counts keep the order in which
    ``repo.get_contributors()`` returned them.

    Args:
        repo: Repository whose contributors and recent commits are inspected.
        committers: GitHub logins to exclude from the result.

    Returns:
        Up to TOP_N_CONTRIBUTORS logins, most recent commits first.
    """
    logging.info(f"Fetching contributors from the repository {REPO_KAFKA}")
    one_year_ago: datetime = datetime.now() - timedelta(days=365)

    # Tally commits per author login in a single pass over the commit list,
    # instead of rescanning every commit once per contributor.
    commits_by_login: Dict[str, int] = {}
    for commit in repo.get_commits(since=one_year_ago):
        author = commit.author
        # Commits whose author is not resolved to a user carry no login and
        # cannot be attributed to any contributor.
        if author is None:
            continue
        commits_by_login[author.login] = commits_by_login.get(author.login, 0) + 1

    # Set membership makes the committer check constant-time per contributor.
    excluded = set(committers)

    # Insertion order follows repo.get_contributors(), which the stable sort
    # below relies on for tie-breaking. Contributors without recent commits
    # are kept with a count of zero.
    contributors: Dict[str, int] = {}
    for contributor in repo.get_contributors():
        login = contributor.login
        if login not in excluded:
            contributors[login] = commits_by_login.get(login, 0)

    sorted_contributors: List[Tuple[str, int]] = sorted(
        contributors.items(), key=lambda item: item[1], reverse=True
    )

    top_contributors = [login for login, _ in sorted_contributors[:TOP_N_CONTRIBUTORS]]
    logging.info(
        f"Found {len(top_contributors)} top contributors who are not committers"
    )
    return top_contributors
|
||||||
|
|
||||||
|
|
||||||
|
def update_local_yaml_content(yaml_file_path: str, collaborators: List[str]) -> None:
    """
    Rewrite the local .asf.yaml file with the refreshed collaborator logins.

    The file is loaded, ``jenkins.github_whitelist`` and
    ``github.collaborators`` are both replaced with the given logins, and the
    document is written back to the same path.

    Args:
        yaml_file_path: Path of the YAML file to read and overwrite.
        collaborators: GitHub logins to store under both keys.
    """
    logging.info(
        f"Updating {yaml_file_path} with {len(collaborators)} new collaborators"
    )

    parser: YAML = YAML()
    with open(yaml_file_path, "r", encoding="utf-8") as source:
        document: dict = parser.load(source)

    document["jenkins"]["github_whitelist"] = collaborators
    # A separate list object is stored here - presumably so the dumper does
    # not emit a YAML alias for a shared list; confirm before changing.
    document["github"]["collaborators"] = collaborators.copy()

    with open(yaml_file_path, "w", encoding="utf-8") as target:
        parser.dump(document, target)

    logging.info(f"Local file {yaml_file_path} updated successfully")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """
    Run the refresh: read the committers, rank the remaining contributors,
    and write the result to the local .asf.yaml file.
    """
    client: Github = get_github_client()

    # The committer list comes from the site repository.
    committers: List[str] = get_committers_list(client.get_repo(REPO_KAFKA_SITE))

    # Contributors are ranked against the main repository.
    top_contributors: List[str] = get_top_contributors(
        client.get_repo(REPO_KAFKA), committers
    )

    update_local_yaml_content(ASF_YAML_PATH, top_contributors)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Same message as before, now with the traceback attached, and a
        # non-zero exit status so shells and CI jobs can detect the failure
        # instead of seeing a successful run.
        logging.exception(f"Error: {e}")
        raise SystemExit(1) from e
|
|
@ -0,0 +1,20 @@
|
||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing,
|
||||||
|
# software distributed under the License is distributed on an
|
||||||
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, either express or implied. See the License for the
|
||||||
|
# specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
beautifulsoup4==4.12.3
|
||||||
|
PyGithub==2.4.0
|
||||||
|
ruamel.yaml==0.18.6
|
Loading…
Reference in New Issue