From 18a78ebf83971381d2119c327c095eada734e26e Mon Sep 17 00:00:00 2001 From: z4yx Date: Sun, 19 Apr 2020 16:11:13 +0800 Subject: [PATCH] reimplement the apt-download in Python (#16) --- apt-sync.py | 203 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 189 insertions(+), 14 deletions(-) diff --git a/apt-sync.py b/apt-sync.py index 5c7de8e..5e6c030 100755 --- a/apt-sync.py +++ b/apt-sync.py @@ -8,20 +8,27 @@ import shutil import subprocess as sp import tempfile import argparse +import bz2 +import gzip +import time +from email.utils import parsedate_to_datetime from pathlib import Path -from typing import List +from typing import List, Tuple + +import requests OS_TEMPLATE = { 'ubuntu-current': ["trusty", "xenial", "bionic", "eoan"], 'ubuntu-lts': ["trusty", "xenial", "bionic"], 'debian-current': ["jessie", "stretch", "buster"], } +MAX_RETRY=int(os.getenv('MAX_RETRY', '3')) +DOWNLOAD_TIMEOUT=int(os.getenv('DOWNLOAD_TIMEOUT', '1800')) pattern_os_template = re.compile(r"@\{(.+)\}") - -apt_download = Path(__file__).parent / "helpers" / "apt-download-binary" -if not apt_download.is_file(): - raise OSError(f"File not found: {apt_download}") +pattern_package_name = re.compile(r"^Filename: (.+)$", re.MULTILINE) +pattern_package_size = re.compile(r"^Size: (\d+)$", re.MULTILINE) +pattern_package_sha256 = re.compile(r"^SHA256: (\w{64})$", re.MULTILINE) def check_args(prop: str, lst: List[str]): for s in lst: @@ -41,6 +48,168 @@ def replace_os_template(os_list: List[str]) -> List[str]: ret.append(i) return ret +def check_and_download(url: str, dst_file: Path)->int: + try: + start = time.time() + with requests.get(url, stream=True, timeout=(5, 10)) as r: + r.raise_for_status() + if 'last-modified' in r.headers: + remote_ts = parsedate_to_datetime( + r.headers['last-modified']).timestamp() + else: remote_ts = None + + with dst_file.open('wb') as f: + for chunk in r.iter_content(chunk_size=1024**2): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + if time.time() - start > DOWNLOAD_TIMEOUT: + raise TimeoutError("Download timeout") + if remote_ts is not None: + os.utime(dst_file, (remote_ts, remote_ts)) + return 0 + except: + traceback.print_exc() + if dst_file.is_file(): + dst_file.unlink() + return 1 + +def mkdir_with_dot_tmp(folder: Path)->Tuple[Path, Path]: + tmpdir = folder / ".tmp" + if tmpdir.is_dir(): + shutil.rmtree(tmpdir) + tmpdir.mkdir(parents=True, exist_ok=True) + return (folder, tmpdir) + +def move_files_in(src: Path, dst: Path): + empty = True + for file in src.glob('*'): + empty = False + print(f"move {file} to {dst}") + shutil.move(file, dst) + if empty: + raise ValueError(f"{src} is empty") + +def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Path, filelist: str = '/dev/null')->int: + if not dest_base_dir.is_dir(): + print("Destination directory is empty, cannot continue") + return 1 + print(f"Started mirroring {base_url} {dist}, {repo}, {arch}!") + + # download Release files + dist_dir,dist_tmp_dir = mkdir_with_dot_tmp(dest_base_dir / "dists" / dist) + check_and_download(f"{base_url}/dists/{dist}/Contents-{arch}.gz",dist_tmp_dir / f"Contents-{arch}.gz") + check_and_download(f"{base_url}/dists/{dist}/InRelease",dist_tmp_dir / "InRelease") + if check_and_download(f"{base_url}/dists/{dist}/Release",dist_tmp_dir / "Release") != 0: + print("Invalid Repository") + return 1 + check_and_download(f"{base_url}/dists/{dist}/Release.gpg",dist_tmp_dir / "Release.gpg") + + # download Contents files + comp_dir,comp_tmp_dir = mkdir_with_dot_tmp(dist_dir / repo) + check_and_download(f"{base_url}/dists/{dist}/{repo}/Contents-{arch}", comp_tmp_dir/f"Contents-{arch}") + check_and_download(f"{base_url}/dists/{dist}/{repo}/Contents-{arch}.gz", comp_tmp_dir/f"Contents-{arch}.gz") + check_and_download(f"{base_url}/dists/{dist}/{repo}/Contents-{arch}.bz2", comp_tmp_dir/f"Contents-{arch}.bz2") + + # load Package Index URLs from the Release file + release_file = dist_tmp_dir / "Release" + pkgidx_dir,pkgidx_tmp_dir = mkdir_with_dot_tmp(comp_dir / f"binary-{arch}") + with open(release_file, "r") as fd: + pkgidx_content=None + cnt_start=False + for line in fd: + if cnt_start: + fields = line.split() + if len(fields) != 3 or len(fields[0]) != 64: # 64 is SHA-256 checksum length + break + checksum, filesize, filename = tuple(fields) + if filename.startswith(f"{repo}/binary-{arch}"): + pkgidx_filename = Path(filename).name # base name + pkgidx_file = pkgidx_tmp_dir / pkgidx_filename + pkglist_url = f"{base_url}/dists/{dist}/{filename}" + if check_and_download(pkglist_url, pkgidx_file) != 0: + print("Failed to download:", pkglist_url) + return 1 + + with pkgidx_file.open('rb') as t: content = t.read() + if len(content) != int(filesize): + print(f"Invalid size of {pkgidx_file}, expected {filesize}") + return 1 + if hashlib.sha256(content).hexdigest() != checksum: + print(f"Invalid checksum of {pkgidx_file}, expected {checksum}") + return 1 + if pkgidx_content is None: + print("getting packages index content") + suffix = pkgidx_file.suffix + if suffix == '.bz2': + pkgidx_content = bz2.decompress(content).decode('utf-8') + elif suffix == '.gz': + pkgidx_content = gzip.decompress(content).decode('utf-8') + elif suffix == '': + pkgidx_content = content.decode('utf-8') + + # Currently only support SHA-256 checksum, because + # "Clients may not use the MD5Sum and SHA1 fields for security purposes, and must require a SHA256 or a SHA512 field." + # from https://wiki.debian.org/DebianRepository/Format#A.22Release.22_files + if line.startswith('SHA256:'): + cnt_start = True + if not cnt_start: + print("Cannot find SHA-256 checksum") + return 1 + if pkgidx_content is None: + print("index is empty, failed") + return 1 + + # Download packages + err = 0 + for pkg in pkgidx_content.split('\n\n'): + try: + pkg_filename = pattern_package_name.search(pkg).group(1) + pkg_size = int(pattern_package_size.search(pkg).group(1)) + pkg_checksum = pattern_package_sha256.search(pkg).group(1) + except: + print("Failed to parse one package description") + traceback.print_exc() + err = 1 + continue + + dest_filename = dest_base_dir / pkg_filename + dest_dir = dest_filename.parent + if not dest_dir.is_dir(): + dest_dir.mkdir(parents=True, exist_ok=True) + if dest_filename.is_file() and dest_filename.stat().st_size == pkg_size: + print(f"Skipping {pkg_filename}, size {pkg_size}") + continue + + pkg_url=f"{base_url}/{pkg_filename}" + for retry in range(MAX_RETRY): + print(f"downloading {pkg_url} to {dest_filename}") + if check_and_download(pkg_url, dest_filename) != 0: + continue + + sha = hashlib.sha256() + with dest_filename.open("rb") as f: + for block in iter(lambda: f.read(1024**2), b""): + sha.update(block) + if sha.hexdigest() != pkg_checksum: + print(f"Invalid checksum of {dest_filename}, expected {pkg_checksum}") + dest_filename.unlink() + continue + break + else: + print(f"Failed to download {dest_filename}") + err = 1 + + try: + move_files_in(pkgidx_tmp_dir, pkgidx_dir) + move_files_in(comp_tmp_dir, comp_dir) + move_files_in(dist_tmp_dir, dist_dir) + except: + traceback.print_exc() + return 1 + + print(f"Mirroring {base_url} {dist}, {repo}, {arch} done!") + return err + def main(): parser = argparse.ArgumentParser() @@ -66,18 +235,24 @@ def main(): filelist = tempfile.mkstemp() failed = [] + # apt_download = Path(__file__).parent / "helpers" / "apt-download-binary" + # if not apt_download.is_file(): + # raise OSError(f"File not found: {apt_download}") + for os in os_list: for comp in component_list: for arch in arch_list: - shell_args = [ - str(apt_download.absolute()), - args.base_url, - os, comp, arch, - str(args.working_dir.absolute()), - filelist[1] ] - # print(shell_args) - ret = sp.run(shell_args) - if ret.returncode != 0: + # shell_args = [ + # str(apt_download.absolute()), + # args.base_url, + # os, comp, arch, + # str(args.working_dir.absolute()), + # filelist[1] ] + # # print(shell_args) + # ret = sp.run(shell_args) + # if ret.returncode != 0: + # failed.append((os, comp, arch)) + if apt_mirror(args.base_url, os, comp, arch, args.working_dir, filelist[1]) != 0: failed.append((os, comp, arch)) if len(failed) > 0: print(f"Failed APT repos of {args.base_url}: ", failed)