| 
									
										
										
										
											2024-09-11 09:29:51 +08:00
										 |  |  | # Licensed to the Apache Software Foundation (ASF) under one | 
					
						
							|  |  |  | # or more contributor license agreements.  See the NOTICE file | 
					
						
							|  |  |  | # distributed with this work for additional information | 
					
						
							|  |  |  | # regarding copyright ownership.  The ASF licenses this file | 
					
						
							|  |  |  | # to you under the Apache License, Version 2.0 (the | 
					
						
							|  |  |  | # "License"); you may not use this file except in compliance | 
					
						
							|  |  |  | # with the License.  You may obtain a copy of the License at | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #   http://www.apache.org/licenses/LICENSE-2.0 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Unless required by applicable law or agreed to in writing, | 
					
						
							|  |  |  | # software distributed under the License is distributed on an | 
					
						
							|  |  |  | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | 
					
						
							|  |  |  | # KIND, either express or implied.  See the License for the | 
					
						
							|  |  |  | # specific language governing permissions and limitations | 
					
						
							|  |  |  | # under the License. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | This script automates the process of fetching contributor data from GitHub | 
					
						
							|  |  |  | repositories, filtering top contributors who are not part of the existing | 
					
						
							|  |  |  | committers, and updating a local configuration file (.asf.yaml) to include these | 
					
						
							|  |  |  | new contributors. | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import io | 
					
						
							|  |  |  | import logging | 
					
						
							|  |  |  | import os | 
					
						
							|  |  |  | from datetime import datetime, timedelta | 
					
						
							|  |  |  | from typing import Dict, List, Tuple | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from bs4 import BeautifulSoup | 
					
						
							|  |  |  | from github import Github | 
					
						
							|  |  |  | from github.Commit import Commit | 
					
						
							|  |  |  | from github.ContentFile import ContentFile | 
					
						
							|  |  |  | from github.PaginatedList import PaginatedList | 
					
						
							|  |  |  | from github.Repository import Repository | 
					
						
							|  |  |  | from ruamel.yaml import YAML | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | logging.basicConfig( | 
					
						
							|  |  |  |     format="%(asctime)s %(levelname)s %(message)s", | 
					
						
							|  |  |  |     level=logging.INFO, | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | GITHUB_TOKEN: str = os.getenv("GITHUB_TOKEN") | 
					
						
							|  |  |  | REPO_KAFKA_SITE: str = "apache/kafka-site" | 
					
						
							|  |  |  | REPO_KAFKA: str = "apache/kafka" | 
					
						
							|  |  |  | ASF_YAML_PATH: str = "../.asf.yaml" | 
					
						
							|  |  |  | TOP_N_CONTRIBUTORS: int = 10 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def get_github_client() -> Github: | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Initialize GitHub client with token. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if not GITHUB_TOKEN: | 
					
						
							|  |  |  |         logging.error("GITHUB_TOKEN is not set in the environment") | 
					
						
							|  |  |  |         raise ValueError("GITHUB_TOKEN is not set in the environment") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     logging.info("Successfully initialized GitHub client") | 
					
						
							|  |  |  |     return Github(GITHUB_TOKEN) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def get_committers_list(repo: Repository) -> List[str]: | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Fetch the committers from the given repository. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     logging.info(f"Fetching committers from the repository {REPO_KAFKA_SITE}") | 
					
						
							|  |  |  |     committers_file: ContentFile = repo.get_contents("committers.html") | 
					
						
							|  |  |  |     content: bytes = committers_file.decoded_content | 
					
						
							|  |  |  |     soup: BeautifulSoup = BeautifulSoup(content, "html.parser") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     committers = [login.text for login in soup.find_all("div", class_="github_login")] | 
					
						
							|  |  |  |     logging.info(f"Found {len(committers)} committers") | 
					
						
							|  |  |  |     return committers | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def get_top_contributors(repo: Repository, committers: List[str]) -> List[str]: | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Get top contributors for the given repository excluding committers. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     logging.info(f"Fetching contributors from the repository {REPO_KAFKA}") | 
					
						
							|  |  |  |     one_year_ago: datetime = datetime.now() - timedelta(days=365) | 
					
						
							|  |  |  |     contributors: Dict[str, int] = {} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     last_year_commits: PaginatedList[Commit] = repo.get_commits(since=one_year_ago) | 
					
						
							|  |  |  |     for contributor in repo.get_contributors(): | 
					
						
							|  |  |  |         if contributor.login not in committers: | 
					
						
							|  |  |  |             contributions: int = 0 | 
					
						
							|  |  |  |             for commit in last_year_commits: | 
					
						
							|  |  |  |                 if commit.author == contributor: | 
					
						
							|  |  |  |                     contributions += 1 | 
					
						
							|  |  |  |             contributors[contributor.login] = contributions | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     sorted_contributors: List[Tuple[str, int]] = sorted( | 
					
						
							|  |  |  |         contributors.items(), key=lambda x: x[1], reverse=True | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     top_contributors = [login for login, _ in sorted_contributors][:TOP_N_CONTRIBUTORS] | 
					
						
							|  |  |  |     logging.info( | 
					
						
							|  |  |  |         f"Found {len(top_contributors)} top contributors who are not committers" | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     return top_contributors | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def update_local_yaml_content(yaml_file_path: str, collaborators: List[str]) -> None: | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Update the local .asf.yaml file with refreshed GitHub whitelist and | 
					
						
							|  |  |  |     collaborators. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     logging.info( | 
					
						
							|  |  |  |         f"Updating {yaml_file_path} with {len(collaborators)} new collaborators" | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-02 22:25:02 +08:00
										 |  |  |     collaborators.sort(key=str.casefold) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-11 09:29:51 +08:00
										 |  |  |     with open(yaml_file_path, "r", encoding="utf-8") as file: | 
					
						
							|  |  |  |         yaml: YAML = YAML() | 
					
						
							| 
									
										
										
										
											2024-10-02 22:25:02 +08:00
										 |  |  |         yaml.indent(mapping=2, sequence=4, offset=2) | 
					
						
							| 
									
										
										
										
											2024-09-11 09:29:51 +08:00
										 |  |  |         yaml_content: dict = yaml.load(file) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-02 22:25:02 +08:00
										 |  |  |     yaml_content["github"]["collaborators"] = collaborators | 
					
						
							| 
									
										
										
										
											2024-09-11 09:29:51 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     with open(yaml_file_path, "w", encoding="utf-8") as file: | 
					
						
							|  |  |  |         yaml.dump(yaml_content, file) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     logging.info(f"Local file {yaml_file_path} updated successfully") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def main() -> None: | 
					
						
							|  |  |  |     github_client: Github = get_github_client() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     kafka_site_repo: Repository = github_client.get_repo(REPO_KAFKA_SITE) | 
					
						
							|  |  |  |     committers: List[str] = get_committers_list(kafka_site_repo) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     kafka_repo: Repository = github_client.get_repo(REPO_KAFKA) | 
					
						
							|  |  |  |     top_contributors: List[str] = get_top_contributors(kafka_repo, committers) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     update_local_yaml_content(ASF_YAML_PATH, top_contributors) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | if __name__ == "__main__": | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         main() | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         logging.error(f"Error: {e}") |