#!/usr/bin/env python3
"""Mirror YUM/DNF repositories: sync packages with `dnf reposync`, then
rebuild the repodata with createrepo_c (or download it from upstream)."""
import argparse
import bz2
import gzip
import os
import sqlite3
import subprocess as sp
import tempfile
import time
import traceback
import xml.etree.ElementTree as ET
from email.utils import parsedate_to_datetime
from pathlib import Path
from typing import Dict, List

import requests

REPO_SIZE_FILE = os.getenv("REPO_SIZE_FILE", "")
DOWNLOAD_TIMEOUT = int(os.getenv("DOWNLOAD_TIMEOUT", "1800"))

REPO_STAT = {}


def calc_repo_size(path: Path):
    # Materialize the glob so it can be both iterated and shown in the error
    # message below (a bare generator would already be exhausted by then).
    dbfiles = list(path.glob("repodata/*primary.*"))
    with tempfile.NamedTemporaryFile() as tmp:
        dec = None
        dbfile = None
        for db in dbfiles:
            dbfile = db
            suffixes = db.suffixes
            if suffixes[-1] == ".bz2":
                dec = bz2.decompress
                suffixes = suffixes[:-1]
            elif suffixes[-1] == ".gz":
                dec = gzip.decompress
                suffixes = suffixes[:-1]
            elif suffixes[-1] in (".sqlite", ".xml"):
                dec = lambda x: x
        if dec is None:
            print(f"Failed to read from {path}: {dbfiles}", flush=True)
            return
        # Only the last matching primary database is inspected.
        with dbfile.open("rb") as f:
            tmp.write(dec(f.read()))
        tmp.flush()

        if suffixes[-1] == ".sqlite":
            conn = sqlite3.connect(tmp.name)
            c = conn.cursor()
            c.execute("select sum(size_package),count(1) from packages")
            size, cnt = c.fetchone()
            conn.close()
        elif suffixes[-1] == ".xml":
            try:
                tree = ET.parse(tmp.name)
                root = tree.getroot()
                assert root.tag.endswith("metadata")
                cnt, size = 0, 0
                for location in root.findall(
                    "./{http://linux.duke.edu/metadata/common}package"
                    "/{http://linux.duke.edu/metadata/common}size"
                ):
                    size += int(location.attrib["package"])
                    cnt += 1
            except Exception:
                traceback.print_exc()
                return
        else:
            print(f"Unknown suffix {suffixes}")
            return

    print(f"Repository {path}:")
    print(f" {cnt} packages, {size} bytes in total", flush=True)

    REPO_STAT[str(path)] = (size, cnt) if cnt > 0 else (0, 0)  # size can be None


def check_and_download(url: str, dst_file: Path) -> int:
    """Download url into dst_file, preserving the upstream mtime.

    Returns 0 on success and 1 on any failure (partial files are removed)."""
    try:
        start = time.time()
        with requests.get(url, stream=True, timeout=(5, 10)) as r:
            r.raise_for_status()
            if "last-modified" in r.headers:
                remote_ts = parsedate_to_datetime(
                    r.headers["last-modified"]
                ).timestamp()
            else:
                remote_ts = None

            with dst_file.open("wb") as f:
                for chunk in r.iter_content(chunk_size=1024**2):
                    if time.time() - start > DOWNLOAD_TIMEOUT:
                        raise TimeoutError("Download timeout")
                    if not chunk:
                        continue  # filter out keep-alive new chunks
                    f.write(chunk)
            if remote_ts is not None:
                os.utime(dst_file, (remote_ts, remote_ts))
        return 0
    except Exception as e:
        print(e, flush=True)
        if dst_file.is_file():
            dst_file.unlink()  # do not leave a partial download behind
    return 1


def download_repodata(url: str, path: Path) -> int:
    path = path / "repodata"
    path.mkdir(exist_ok=True)
    oldfiles = set(path.glob("*.*"))
    newfiles = set()
    # Fetch repomd.xml under a hidden name first; it is renamed into place
    # only after every file it references has been downloaded.
    if check_and_download(url + "/repodata/repomd.xml", path / ".repomd.xml") != 0:
        print(f"Failed to download the repomd.xml of {url}")
        return 1
    try:
        tree = ET.parse(path / ".repomd.xml")
        root = tree.getroot()
        assert root.tag.endswith("repomd")
        for location in root.findall(
            "./{http://linux.duke.edu/metadata/repo}data"
            "/{http://linux.duke.edu/metadata/repo}location"
        ):
            href = location.attrib["href"]
            assert href.startswith("repodata/") and len(href) > len("repodata/")
            fn = path / href[len("repodata/"):]
            newfiles.add(fn)
            if check_and_download(url + "/" + href, fn) != 0:
                print(f"Failed to download the {href}")
                return 1
    except Exception:
        traceback.print_exc()
        return 1

    (path / ".repomd.xml").rename(path / "repomd.xml")  # update the repomd.xml
    newfiles.add(path / "repomd.xml")
    for i in oldfiles - newfiles:
        print(f"Deleting old files: {i}")
        i.unlink()
    return 0


def check_args(prop: str, lst: List[str]):
    for s in lst:
        if len(s) == 0 or " " in s:
            raise ValueError(f"Invalid item in {prop}: {repr(s)}")


def substitute_vars(s: str, vardict: Dict[str, str]) -> str:
    """Expand every "@{key}" placeholder in s with its value from vardict."""
    for key, val in vardict.items():
        tpl = "@{" + key + "}"
        s = s.replace(tpl, val)
    return s
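# For example (illustrative values only): with vardict
# {"arch": "x86_64", "os_ver": "8", "comp": "mysql57-community"},
# the repo_name template "@{comp}-el@{os_ver}" expands to
# "mysql57-community-el8", which serves as both the dnf repo id and the
# destination directory name below.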
ValueError(f"Invalid item in {prop}: {repr(s)}") def substitute_vars(s: str, vardict: Dict[str, str]) -> str: for key, val in vardict.items(): tpl = "@{" + key + "}" s = s.replace(tpl, val) return s def main(): parser = argparse.ArgumentParser() parser.add_argument("base_url", type=str, help="base URL") parser.add_argument("os_version", type=str, help="e.g. 7-8,9") parser.add_argument( "component", type=str, help="e.g. mysql56-community,mysql57-community" ) parser.add_argument("arch", type=str, help="e.g. x86_64,aarch64") parser.add_argument("repo_name", type=str, help="e.g. @{comp}-el@{os_ver}") parser.add_argument("working_dir", type=Path, help="working directory") parser.add_argument( "--download-repodata", action="store_true", help="download repodata files instead of generating them", ) parser.add_argument( "--pass-arch-to-reposync", action="store_true", help="""pass --arch to reposync to further filter packages by 'arch' field in metadata (NOT recommended, prone to missing packages in some repositories, e.g. mysql)""", ) args = parser.parse_args() os_list = [] for os_version in args.os_version.split(","): if "-" in os_version and "-stream" not in os_version: dash = os_version.index("-") os_list = os_list + [ str(i) for i in range(int(os_version[:dash]), 1 + int(os_version[dash + 1 :])) ] else: os_list.append(os_version) check_args("os_version", os_list) component_list = args.component.split(",") check_args("component", component_list) arch_list = args.arch.split(",") check_args("arch", arch_list) failed = [] args.working_dir.mkdir(parents=True, exist_ok=True) cache_dir = tempfile.mkdtemp() def combination_os_comp(arch: str): for os in os_list: for comp in component_list: vardict = { "arch": arch, "os_ver": os, "comp": comp, } name = substitute_vars(args.repo_name, vardict) url = substitute_vars(args.base_url, vardict) try: probe_url = ( url + ("" if url.endswith("/") else "/") + "repodata/repomd.xml" ) r = requests.head(probe_url, timeout=(7, 7)) if r.status_code < 400 or r.status_code == 403: yield (name, url) else: print(probe_url, "->", r.status_code) except: traceback.print_exc() for arch in arch_list: dest_dirs = [] conf = tempfile.NamedTemporaryFile("w", suffix=".conf") conf.write( """ [main] keepcache=0 """ ) for name, url in combination_os_comp(arch): conf.write( f""" [{name}] name={name} baseurl={url} repo_gpgcheck=0 gpgcheck=0 enabled=1 """ ) dst = (args.working_dir / name).absolute() dst.mkdir(parents=True, exist_ok=True) dest_dirs.append(dst) conf.flush() # sp.run(["cat", conf.name]) # sp.run(["ls", "-la", cache_dir]) if len(dest_dirs) == 0: print("Nothing to sync", flush=True) failed.append(("", arch)) continue cmd_args = [ "dnf", "reposync", "-c", conf.name, "--delete", "-p", str(args.working_dir.absolute()), ] if args.pass_arch_to_reposync: cmd_args += ["--arch", arch] print(f"Launching dnf reposync with command: {cmd_args}", flush=True) ret = sp.run(cmd_args) if ret.returncode != 0: failed.append((name, arch)) continue for path in dest_dirs: path.mkdir(exist_ok=True) if args.download_repodata: download_repodata(url, path) else: cmd_args = [ "createrepo_c", "--update", "-v", "-c", cache_dir, "-o", str(path), str(path), ] print(f"Launching createrepo with command: {cmd_args}", flush=True) ret = sp.run(cmd_args) calc_repo_size(path) if len(failed) > 0: print(f"Failed YUM repos: {failed}", flush=True) else: if len(REPO_SIZE_FILE) > 0: with open(REPO_SIZE_FILE, "a") as fd: total_size = sum([r[0] for r in REPO_STAT.values()]) fd.write(f"+{total_size}") if __name__ == 
"__main__": main()