gwapi/utils/tools.py

import datetime
import json
import logging
import os
import re
import shutil
import sys
from collections import defaultdict
from logging.handlers import RotatingFileHandler
from time import time
from urllib.parse import urlparse, urlunparse

import pytz
import requests
from bs4 import BeautifulSoup
from flask import send_file, make_response
from opencc import OpenCC

import utils.constants as constants
from utils.config import config, resource_path
from utils.types import ChannelData

opencc_t2s = OpenCC("t2s")


def get_logger(path, level=logging.ERROR, init=False):
    """
    get the logger
    """
    os.makedirs(os.path.dirname(path), exist_ok=True)
    os.makedirs(constants.output_dir, exist_ok=True)
    if init and os.path.exists(path):
        os.remove(path)
    handler = RotatingFileHandler(path, encoding="utf-8")
    logger = logging.getLogger(path)
    logger.addHandler(handler)
    logger.setLevel(level)
    return logger


def format_interval(t):
    """
    Formats a number of seconds as a clock time, [H:]MM:SS

    Parameters
    ----------
    t  : int or float
        Number of seconds.
    Returns
    -------
    out  : str
        [H:]MM:SS
    """
    mins, s = divmod(int(t), 60)
    h, m = divmod(mins, 60)
    if h:
        return "{0:d}:{1:02d}:{2:02d}".format(h, m, s)
    else:
        return "{0:02d}:{1:02d}".format(m, s)


def get_pbar_remaining(n=0, total=0, start_time=None):
    """
    Get the remaining time of the progress bar
    """
    try:
        elapsed = time() - start_time
        completed_tasks = n
        if completed_tasks > 0:
            avg_time_per_task = elapsed / completed_tasks
            remaining_tasks = total - completed_tasks
            remaining_time = format_interval(avg_time_per_task * remaining_tasks)
        else:
            remaining_time = "未知"
        return remaining_time
    except Exception as e:
        print(f"Error: {e}")


def update_file(final_file, old_file, copy=False):
    """
    Update the file
    """
    old_file_path = resource_path(old_file, persistent=True)
    final_file_path = resource_path(final_file, persistent=True)
    if os.path.exists(old_file_path):
        if copy:
            shutil.copyfile(old_file_path, final_file_path)
        else:
            os.replace(old_file_path, final_file_path)


def filter_by_date(data):
    """
    Filter by date and limit
    """
    default_recent_days = 30
    use_recent_days = config.recent_days
    if not isinstance(use_recent_days, int) or use_recent_days <= 0:
        use_recent_days = default_recent_days
    start_date = datetime.datetime.now() - datetime.timedelta(days=use_recent_days)
    recent_data = []
    unrecent_data = []
    for info, response_time in data:
        item = (info, response_time)
        date = info["date"]
        if date:
            date = datetime.datetime.strptime(date, "%m-%d-%Y")
            if date >= start_date:
                recent_data.append(item)
            else:
                unrecent_data.append(item)
        else:
            unrecent_data.append(item)
    recent_data_len = len(recent_data)
    if recent_data_len == 0:
        recent_data = unrecent_data
    elif recent_data_len < config.urls_limit:
        recent_data.extend(unrecent_data[: config.urls_limit - len(recent_data)])
    return recent_data


def get_soup(source):
    """
    Get soup from source
    """
    source = re.sub(
        r"<!--.*?-->",
        "",
        source,
        flags=re.DOTALL,
    )
    soup = BeautifulSoup(source, "html.parser")
    return soup


def get_resolution_value(resolution_str):
    """
    Get resolution value from string
    """
    try:
        if resolution_str:
            pattern = r"(\d+)[xX*](\d+)"
            match = re.search(pattern, resolution_str)
            if match:
                width, height = map(int, match.groups())
                return width * height
    except:
        pass
    return 0


def get_total_urls(info_list: list[ChannelData], ipv_type_prefer, origin_type_prefer, rtmp_type=None) -> list:
    """
    Get the total urls from info list
    """
    ipv_prefer_bool = bool(ipv_type_prefer)
    origin_prefer_bool = bool(origin_type_prefer)
    if not ipv_prefer_bool:
        ipv_type_prefer = ["all"]
    if not origin_prefer_bool:
        origin_type_prefer = ["all"]
    categorized_urls = {origin: {ipv_type: [] for ipv_type in ipv_type_prefer} for origin in origin_type_prefer}
    total_urls = []
    for info in info_list:
        channel_id, url, origin, resolution, url_ipv_type, extra_info = (
            info["id"],
            info["url"],
            info["origin"],
            info["resolution"],
            info["ipv_type"],
            info.get("extra_info", ""),
        )
        if not origin:
            continue

        if origin in ["live", "hls"]:
            if not rtmp_type or (rtmp_type and origin in rtmp_type):
                total_urls.append(info)
                continue
            else:
                continue

        if origin == "whitelist":
            total_urls.append(info)
            continue

        if origin_prefer_bool and (origin not in origin_type_prefer):
            continue

        if not extra_info:
            info["extra_info"] = constants.origin_map[origin]

        if not origin_prefer_bool:
            origin = "all"

        if ipv_prefer_bool:
            if url_ipv_type in ipv_type_prefer:
                categorized_urls[origin][url_ipv_type].append(info)
        else:
            categorized_urls[origin]["all"].append(info)

    ipv_num = {ipv_type: 0 for ipv_type in ipv_type_prefer}
    urls_limit = config.urls_limit
    for origin in origin_type_prefer:
        if len(total_urls) >= urls_limit:
            break
        for ipv_type in ipv_type_prefer:
            if len(total_urls) >= urls_limit:
                break
            ipv_type_num = ipv_num[ipv_type]
            ipv_type_limit = config.ipv_limit[ipv_type] or urls_limit
            if ipv_type_num < ipv_type_limit:
                urls = categorized_urls[origin][ipv_type]
                if not urls:
                    continue
                limit = min(
                    max(config.source_limits.get(origin, urls_limit) - ipv_type_num, 0),
                    max(ipv_type_limit - ipv_type_num, 0),
                )
                limit_urls = urls[:limit]
                total_urls.extend(limit_urls)
                ipv_num[ipv_type] += len(limit_urls)
            else:
                continue

    total_urls = total_urls[:urls_limit]

    return total_urls


def get_total_urls_from_sorted_data(data):
    """
    Get the total urls with filter by date and duplicate from sorted data
    """
    if len(data) > config.urls_limit:
        total_urls = [channel_data["url"] for channel_data, _ in filter_by_date(data)]
    else:
        total_urls = [channel_data["url"] for channel_data, _ in data]
    return list(dict.fromkeys(total_urls))[: config.urls_limit]


def check_ipv6_support():
    """
    Check if the system network supports ipv6
    """
    if os.getenv("GITHUB_ACTIONS"):
        return False
    url = "https://ipv6.tokyo.test-ipv6.com/ip/?callback=?&testdomain=test-ipv6.com&testname=test_aaaa"
    try:
        print("Checking if your network supports IPv6...")
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            print("Your network supports IPv6")
            return True
    except Exception:
        pass
    print("Your network does not support IPv6, don't worry, the IPv6 results will be saved")
    return False


def check_ipv_type_match(ipv_type: str) -> bool:
    """
    Check if the ipv type matches
    """
    config_ipv_type = config.ipv_type
    return (
            config_ipv_type == ipv_type
            or config_ipv_type == "全部"
            or config_ipv_type == "all"
    )


def check_url_by_keywords(url, keywords=None):
    """
    Check by URL keywords
    """
    if not keywords:
        return True
    else:
        return any(keyword in url for keyword in keywords)


def merge_objects(*objects, match_key=None):
    """
    Merge objects

    Args:
        *objects: Dictionaries to merge
        match_key: If dict1[key] is a list of dicts, this key will be used to match and merge dicts
    """

    def merge_dicts(dict1, dict2):
        for key, value in dict2.items():
            if key in dict1:
                if isinstance(dict1[key], dict) and isinstance(value, dict):
                    merge_dicts(dict1[key], value)
                elif isinstance(dict1[key], set):
                    dict1[key].update(value)
                elif isinstance(dict1[key], list) and isinstance(value, list):
                    if match_key and all(isinstance(x, dict) for x in dict1[key] + value):
                        existing_items = {item[match_key]: item for item in dict1[key]}
                        for new_item in value:
                            if match_key in new_item and new_item[match_key] in existing_items:
                                merge_dicts(existing_items[new_item[match_key]], new_item)
                            else:
                                dict1[key].append(new_item)
                    else:
                        dict1[key].extend(x for x in value if x not in dict1[key])
                elif value != dict1[key]:
                    dict1[key] = value
            else:
                dict1[key] = value

    merged_dict = {}
    for obj in objects:
        if not isinstance(obj, dict):
            raise TypeError("All input objects must be dictionaries")
        merge_dicts(merged_dict, obj)

    return merged_dict


def get_ip_address():
    """
    Get the IP address
    """
    host = os.getenv("APP_HOST", config.app_host)
    port = os.getenv("APP_PORT", config.app_port)
    return f"{host}:{port}"


def get_epg_url():
    """
    Get the epg result url
    """
    if os.getenv("GITHUB_ACTIONS"):
        repository = os.getenv("GITHUB_REPOSITORY", "Guovin/iptv-api")
        ref = os.getenv("GITHUB_REF", "gd")
        return join_url(config.cdn_url, f"https://raw.githubusercontent.com/{repository}/{ref}/output/epg/epg.gz")
    else:
        return f"{get_ip_address()}/epg/epg.gz"


def convert_to_m3u(path=None, first_channel_name=None, data=None):
    """
    Convert result txt to m3u format
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as file:
            m3u_output = f'#EXTM3U x-tvg-url="{get_epg_url()}"\n'
            current_group = None
            logo_url = join_url(config.cdn_url,
                                config.logo_url) if "raw.githubusercontent.com" in config.logo_url else config.logo_url
            for line in file:
                trimmed_line = line.strip()
                if trimmed_line != "":
                    if "#genre#" in trimmed_line:
                        current_group = trimmed_line.replace(",#genre#", "").strip()
                    else:
                        try:
                            original_channel_name, _, channel_link = map(
                                str.strip, trimmed_line.partition(",")
                            )
                        except:
                            continue
                        processed_channel_name = re.sub(
                            r"(CCTV|CETV)-(\d+)(\+.*)?",
                            lambda m: f"{m.group(1)}{m.group(2)}"
                                      + ("+" if m.group(3) else ""),
                            first_channel_name if current_group == "🕘️更新时间" else original_channel_name,
                        )
                        m3u_output += f'#EXTINF:-1 tvg-name="{processed_channel_name}" tvg-logo="{join_url(logo_url, f'{processed_channel_name}.{config.logo_type}')}"'
                        if current_group:
                            m3u_output += f' group-title="{current_group}"'
                        item_data = {}
                        if data:
                            item_list = data.get(original_channel_name, [])
                            for item in item_list:
                                if item["url"] == channel_link:
                                    item_data = item
                                    break
                        if item_data:
                            catchup = item_data.get("catchup")
                            if catchup:
                                for key, value in catchup.items():
                                    m3u_output += f' {key}="{value}"'
                        m3u_output += f",{original_channel_name}\n"
                        if item_data and config.open_headers:
                            headers = item_data.get("headers")
                            if headers:
                                for key, value in headers.items():
                                    m3u_output += f"#EXTVLCOPT:http-{key.lower()}={value}\n"
                        m3u_output += f"{channel_link}\n"
            m3u_file_path = os.path.splitext(path)[0] + ".m3u"
            with open(m3u_file_path, "w", encoding="utf-8") as m3u_file:
                m3u_file.write(m3u_output)
            # print(f"✅ M3U result file generated at: {m3u_file_path}")


def get_result_file_content(path=None, show_content=False, file_type=None):
    """
    Get the content of the result file
    """
    result_file = (
        os.path.splitext(path)[0] + f".{file_type}"
        if file_type
        else path
    )
    if os.path.exists(result_file):
        if config.open_m3u_result:
            if file_type == "m3u" or not file_type:
                result_file = os.path.splitext(path)[0] + ".m3u"
            if file_type != "txt" and show_content == False:
                return send_file(resource_path(result_file), as_attachment=True)
        with open(result_file, "r", encoding="utf-8") as file:
            content = file.read()
    else:
        content = constants.waiting_tip
    response = make_response(content)
    response.mimetype = 'text/plain'
    return response


def remove_duplicates_from_list(data_list, seen, filter_host=False, ipv6_support=True):
    """
    Remove duplicates from data list
    """
    unique_list = []
    for item in data_list:
        if item["origin"] in ["whitelist", "live", "hls"]:
            continue
        if not ipv6_support and item["ipv_type"] == "ipv6":
            continue
        part = item["host"] if filter_host else item["url"]
        if part not in seen:
            seen.add(part)
            unique_list.append(item)
    return unique_list


def process_nested_dict(data, seen, filter_host=False, ipv6_support=True):
    """
    Process nested dict
    """
    for key, value in data.items():
        if isinstance(value, dict):
            process_nested_dict(value, seen, filter_host, ipv6_support)
        elif isinstance(value, list):
            data[key] = remove_duplicates_from_list(value, seen, filter_host, ipv6_support)


def get_url_host(url):
    """
    Get the url host
    """
    matcher = constants.url_host_pattern.search(url)
    if matcher:
        return matcher.group()
    return None


def add_url_info(url, info):
    """
    Add url info to the URL
    """
    if info:
        separator = "-" if "$" in url else "$"
        url += f"{separator}{info}"
    return url


def format_url_with_cache(url, cache=None):
    """
    Format the URL with cache
    """
    cache = cache or get_url_host(url) or ""
    return add_url_info(url, f"cache:{cache}") if cache else url


def remove_cache_info(string):
    """
    Remove the cache info from the string
    """
    return re.sub(r"[.*]?\$?-?cache:.*", "", string)


def resource_path(relative_path, persistent=False):
    """
    Get the resource path
    """
    base_path = os.path.abspath(".")
    total_path = os.path.join(base_path, relative_path)
    if persistent or os.path.exists(total_path):
        return total_path
    else:
        try:
            base_path = sys._MEIPASS
            return os.path.join(base_path, relative_path)
        except Exception:
            return total_path


def write_content_into_txt(content, path=None, position=None, callback=None):
    """
    Write content into txt file
    """
    if not path:
        return

    mode = "r+" if position == "top" else "a"
    with open(path, mode, encoding="utf-8") as f:
        if position == "top":
            existing_content = f.read()
            f.seek(0, 0)
            f.write(f"{content}\n{existing_content}")
        else:
            f.write(content)

    if callback:
        callback()


def format_name(name: str) -> str:
    """
    Format the  name with sub and replace and lower
    """
    name = opencc_t2s.convert(name)
    for region in constants.region_list:
        name = name.replace(f"{region}｜", "")
    name = constants.sub_pattern.sub("", name)
    for old, new in constants.replace_dict.items():
        name = name.replace(old, new)
    return name.lower()


def get_headers_key_value(content: str) -> dict:
    """
    Get the headers key value from content
    """
    key_value = {}
    for match in constants.key_value_pattern.finditer(content):
        key = match.group("key").strip().replace("http-", "").replace("-", "").lower()
        if "refer" in key:
            key = "referer"
        value = match.group("value").replace('"', "").strip()
        if key and value:
            key_value[key] = value
    return key_value


def get_name_url(content, pattern, open_headers=False, check_url=True):
    """
    Extract name and URL from content using a regex pattern.
    :param content: str, the input content to search.
    :param pattern: re.Pattern, the compiled regex pattern to match.
    :param open_headers: bool, whether to extract headers.
    :param check_url: bool, whether to validate the presence of a URL.
    """
    result = []
    for match in pattern.finditer(content):
        group_dict = match.groupdict()
        name = (group_dict.get("name", "") or "").strip()
        url = (group_dict.get("url", "") or "").strip()
        if not name or (check_url and not url):
            continue
        data = {"name": name, "url": url}
        attributes = {**get_headers_key_value(group_dict.get("attributes", "")),
                      **get_headers_key_value(group_dict.get("options", ""))}
        headers = {
            "User-Agent": attributes.get("useragent", ""),
            "Referer": attributes.get("referer", ""),
            "Origin": attributes.get("origin", "")
        }
        catchup = {
            "catchup": attributes.get("catchup", ""),
            "catchup-source": attributes.get("catchupsource", ""),
        }
        headers = {k: v for k, v in headers.items() if v}
        catchup = {k: v for k, v in catchup.items() if v}
        if not open_headers and headers:
            continue
        if open_headers:
            data["headers"] = headers
        data["catchup"] = catchup
        result.append(data)
    return result


def get_real_path(path) -> str:
    """
    Get the real path
    """
    dir_path, file = os.path.split(path)
    user_real_path = os.path.join(dir_path, 'user_' + file)
    real_path = user_real_path if os.path.exists(user_real_path) else path
    return real_path


def get_urls_from_file(path: str, pattern_search: bool = True) -> list:
    """
    Get the urls from file
    """
    real_path = get_real_path(resource_path(path))
    urls = []
    if os.path.exists(real_path):
        with open(real_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                if pattern_search:
                    match = constants.url_pattern.search(line)
                    if match:
                        urls.append(match.group().strip())
                else:
                    urls.append(line)
    return urls


def get_name_urls_from_file(path: str, format_name_flag: bool = False) -> dict[str, list]:
    """
    Get the name and urls from file
    """
    real_path = get_real_path(resource_path(path))
    name_urls = defaultdict(list)
    if os.path.exists(real_path):
        with open(real_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line.startswith("#"):
                    continue
                name_url = get_name_url(line, pattern=constants.txt_pattern)
                if name_url and name_url[0]:
                    name = format_name(name_url[0]["name"]) if format_name_flag else name_url[0]["name"]
                    url = name_url[0]["url"]
                    if url not in name_urls[name]:
                        name_urls[name].append(url)
    return name_urls


def get_name_uri_from_dir(path: str) -> dict:
    """
    Get the name and uri from dir, only from file name
    """
    real_path = get_real_path(resource_path(path))
    name_urls = defaultdict(list)
    if os.path.exists(real_path):
        for file in os.listdir(real_path):
            filename = file.rsplit(".", 1)[0]
            name_urls[filename].append(f"{real_path}/{file}")
    return name_urls


def get_datetime_now():
    """
    Get the datetime now
    """
    now = datetime.datetime.now()
    time_zone = pytz.timezone(config.time_zone)
    return now.astimezone(time_zone).strftime("%Y-%m-%d %H:%M:%S")


def get_version_info():
    """
    Get the version info
    """
    with open(resource_path("version.json"), "r", encoding="utf-8") as f:
        return json.load(f)


def join_url(url1: str, url2: str) -> str:
    """
    Get the join url
    :param url1: The first url
    :param url2: The second url
    :return: The join url
    """
    if not url1:
        return url2
    if not url2:
        return url1
    if not url1.endswith("/"):
        url1 += "/"
    return url1 + url2


def add_port_to_url(url: str, port: int) -> str:
    """
    Add port to the url
    """
    parsed = urlparse(url)
    netloc = parsed.netloc
    if parsed.username and parsed.password:
        netloc = f"{parsed.username}:{parsed.password}@{netloc}"
    if port:
        netloc = f"{netloc}:{port}"
    new_url = urlunparse((
        parsed.scheme,
        netloc,
        parsed.path,
        parsed.params,
        parsed.query,
        parsed.fragment
    ))
    return new_url


def get_url_without_scheme(url: str) -> str:
    """
    Get the url without scheme
    """
    parsed = urlparse(url)
    return parsed.netloc + parsed.path


def find_by_id(data: dict, id: int) -> dict:
    """
    Find the nested dict by id
    :param data: target data
    :param id: target id
    :return: target dict
    """
    if isinstance(data, dict) and 'id' in data and data['id'] == id:
        return data
    for key, value in data.items():
        if isinstance(value, dict):
            result = find_by_id(value, id)
            if result is not None:
                return result
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, dict):
                    result = find_by_id(item, id)
                    if result is not None:
                        return result
    return {}


def custom_print(*args, **kwargs):
    """
    Custom print
    """
    if not custom_print.disable:
        print(*args, **kwargs)


def get_urls_len(data) -> int:
    """
    Get the dict urls length
    """
    urls = set(
        url_info["url"]
        for value in data.values()
        for url_info_list in value.values()
        for url_info in url_info_list
    )
    return len(urls)