add skip-checksum

yylbfyl 2024-08-22 16:57:08 +08:00
parent d731922d53
commit 5f2382ca9c
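
The --skip-checksum flag gates only the SHA-256 verification of downloaded package index files; the size recorded in the Release file is still enforced, as the diff below shows. A minimal sketch of that gating logic, written as a standalone helper for clarity (the name verify_index is hypothetical and not part of the script):

import hashlib
from pathlib import Path

def verify_index(path: Path, expected_size: int, expected_sha256: str,
                 skip_checksum: bool = False) -> bool:
    # Mirrors the checks in apt_mirror(): the size from the Release file is
    # always enforced, while the SHA-256 comparison is bypassed when
    # skip_checksum is set.
    content = path.read_bytes()
    if len(content) != expected_size:
        return False
    if not skip_checksum and hashlib.sha256(content).hexdigest() != expected_sha256:
        return False
    return True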


@@ -4,7 +4,6 @@ import traceback
 import os
 import re
 import shutil
-import subprocess as sp
 import argparse
 import bz2
 import gzip
@@ -23,21 +22,27 @@ requests.utils.default_user_agent = lambda: APT_SYNC_USER_AGENT
 # set preferred address family
 import requests.packages.urllib3.util.connection as urllib3_cn
-USE_ADDR_FAMILY = os.getenv('USE_ADDR_FAMILY', '').strip().lower()
-if USE_ADDR_FAMILY != '':
-    assert USE_ADDR_FAMILY in ['ipv4', 'ipv6'], "USE_ADDR_FAMILY must be either ipv4 or ipv6"
-    urllib3_cn.allowed_gai_family = lambda: socket.AF_INET if USE_ADDR_FAMILY == 'ipv4' else socket.AF_INET6
+USE_ADDR_FAMILY = os.getenv("USE_ADDR_FAMILY", "").strip().lower()
+if USE_ADDR_FAMILY != "":
+    assert USE_ADDR_FAMILY in [
+        "ipv4",
+        "ipv6",
+    ], "USE_ADDR_FAMILY must be either ipv4 or ipv6"
+    urllib3_cn.allowed_gai_family = lambda: (
+        socket.AF_INET if USE_ADDR_FAMILY == "ipv4" else socket.AF_INET6
+    )
 
 OS_TEMPLATE = {
-    'ubuntu-lts': ["focal", "jammy", "noble"],
-    'debian-current': ["buster", "bullseye", "bookworm"],
-    'debian-latest2': ["bullseye", "bookworm"],
-    'debian-latest': ["bookworm"],
+    "ubuntu-lts": ["focal", "jammy", "noble"],
+    "debian-current": ["bullseye", "bookworm"],
+    "debian-latest2": ["bullseye", "bookworm"],
+    "debian-latest": ["bookworm"],
 }
-ARCH_NO_PKGIDX = ['dep11', 'i18n', 'cnf']
-MAX_RETRY=int(os.getenv('MAX_RETRY', '3'))
-DOWNLOAD_TIMEOUT=int(os.getenv('DOWNLOAD_TIMEOUT', '1800'))
-REPO_SIZE_FILE = os.getenv('REPO_SIZE_FILE', '')
+ARCH_NO_PKGIDX = ["dep11", "i18n", "cnf"]
+MAX_RETRY = int(os.getenv("MAX_RETRY", "3"))
+DOWNLOAD_TIMEOUT = int(os.getenv("DOWNLOAD_TIMEOUT", "1800"))
+REPO_SIZE_FILE = os.getenv("REPO_SIZE_FILE", "")
 
 pattern_os_template = re.compile(r"@\{(.+)\}")
 pattern_package_name = re.compile(r"^Filename: (.+)$", re.MULTILINE)
@@ -45,11 +50,13 @@ pattern_package_size = re.compile(r"^Size: (\d+)$", re.MULTILINE)
 pattern_package_sha256 = re.compile(r"^SHA256: (\w{64})$", re.MULTILINE)
 download_cache = dict()
 
+
 def check_args(prop: str, lst: List[str]):
     for s in lst:
-        if len(s)==0 or ' ' in s:
+        if len(s) == 0 or " " in s:
             raise ValueError(f"Invalid item in {prop}: {repr(s)}")
 
+
 def replace_os_template(os_list: List[str]) -> List[str]:
     ret = []
     for i in os_list:
@@ -57,46 +64,54 @@ def replace_os_template(os_list: List[str]) -> List[str]:
         if matched:
             for os in OS_TEMPLATE[matched.group(1)]:
                 ret.append(pattern_os_template.sub(os, i))
-        elif i.startswith('@'):
+        elif i.startswith("@"):
             ret.extend(OS_TEMPLATE[i[1:]])
         else:
             ret.append(i)
     return ret
 
+
 def check_and_download(url: str, dst_file: Path, caching=False) -> int:
     try:
         if caching:
             if url in download_cache:
                 print(f"Using cached content: {url}", flush=True)
-                with dst_file.open('wb') as f:
+                with dst_file.open("wb") as f:
                     f.write(download_cache[url])
                 return 0
             download_cache[url] = bytes()
         start = time.time()
         with requests.get(url, stream=True, timeout=(5, 10)) as r:
             r.raise_for_status()
-            if 'last-modified' in r.headers:
+            if "last-modified" in r.headers:
                 remote_ts = parsedate_to_datetime(
-                    r.headers['last-modified']).timestamp()
-            else: remote_ts = None
+                    r.headers["last-modified"]
+                ).timestamp()
+            else:
+                remote_ts = None
 
-            with dst_file.open('wb') as f:
+            with dst_file.open("wb") as f:
                 for chunk in r.iter_content(chunk_size=1024**2):
                     if time.time() - start > DOWNLOAD_TIMEOUT:
                         raise TimeoutError("Download timeout")
-                    if not chunk: continue # filter out keep-alive new chunks
+                    if not chunk:
+                        continue  # filter out keep-alive new chunks
                     f.write(chunk)
-                    if caching: download_cache[url] += chunk
+                    if caching:
+                        download_cache[url] += chunk
             if remote_ts is not None:
                 os.utime(dst_file, (remote_ts, remote_ts))
         return 0
     except BaseException as e:
         print(e, flush=True)
-        if dst_file.is_file(): dst_file.unlink()
-        if url in download_cache: del download_cache[url]
+        if dst_file.is_file():
+            dst_file.unlink()
+        if url in download_cache:
+            del download_cache[url]
         return 1
 
+
 def mkdir_with_dot_tmp(folder: Path) -> Tuple[Path, Path]:
     tmpdir = folder / ".tmp"
     if tmpdir.is_dir():
@@ -104,9 +119,10 @@ def mkdir_with_dot_tmp(folder: Path)->Tuple[Path, Path]:
     tmpdir.mkdir(parents=True, exist_ok=True)
     return (folder, tmpdir)
 
+
 def move_files_in(src: Path, dst: Path):
     empty = True
-    for file in src.glob('*'):
+    for file in src.glob("*"):
         empty = False
         print(f"moving {file} to {dst}")
         # shutil.move(str(file), str(dst))
@@ -119,7 +135,17 @@ def move_files_in(src: Path, dst: Path):
     if empty:
         print(f"{src} is empty")
 
-def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Path, deb_set: Dict[str, int], skip_checksum: bool)->int:
+
+def apt_mirror(
+    base_url: str,
+    dist: str,
+    repo: str,
+    arch: str,
+    dest_base_dir: Path,
+    deb_set: Dict[str, int],
+    skip_checksum: bool,
+) -> int:
     if not dest_base_dir.is_dir():
         print("Destination directory is empty, cannot continue")
         return 1
@@ -127,14 +153,27 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
     # download Release files
     dist_dir, dist_tmp_dir = mkdir_with_dot_tmp(dest_base_dir / "dists" / dist)
-    check_and_download(f"{base_url}/dists/{dist}/InRelease",dist_tmp_dir / "InRelease", caching=True)
-    if check_and_download(f"{base_url}/dists/{dist}/Release",dist_tmp_dir / "Release", caching=True) != 0:
+    check_and_download(
+        f"{base_url}/dists/{dist}/InRelease", dist_tmp_dir / "InRelease", caching=True
+    )
+    if (
+        check_and_download(
+            f"{base_url}/dists/{dist}/Release", dist_tmp_dir / "Release", caching=True
+        )
+        != 0
+    ):
         print("Invalid Repository")
         if not (dist_dir / "Release").is_file():
-            print(f"{dist_dir/'Release'} never existed, upstream may not provide packages for {dist}, ignore this error")
+            print(
+                f"{dist_dir/'Release'} never existed, upstream may not provide packages for {dist}, ignore this error"
+            )
             return 0
         return 1
-    check_and_download(f"{base_url}/dists/{dist}/Release.gpg",dist_tmp_dir / "Release.gpg", caching=True)
+    check_and_download(
+        f"{base_url}/dists/{dist}/Release.gpg",
+        dist_tmp_dir / "Release.gpg",
+        caching=True,
+    )
 
     comp_dir, comp_tmp_dir = mkdir_with_dot_tmp(dist_dir / repo)
@@ -148,12 +187,16 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
         for line in fd:
             if cnt_start:
                 fields = line.split()
-                if len(fields) != 3 or len(fields[0]) != 64: # 64 is SHA-256 checksum length
+                if (
+                    len(fields) != 3 or len(fields[0]) != 64
+                ):  # 64 is SHA-256 checksum length
                     break
                 checksum, filesize, filename = tuple(fields)
-                if filename.startswith(f"{repo}/{arch_dir}/") or \
-                   filename.startswith(f"{repo}/Contents-{arch}") or \
-                   filename.startswith(f"Contents-{arch}"):
+                if (
+                    filename.startswith(f"{repo}/{arch_dir}/")
+                    or filename.startswith(f"{repo}/Contents-{arch}")
+                    or filename.startswith(f"Contents-{arch}")
+                ):
                     fn = Path(filename)
                     if len(fn.parts) <= 3:
                         # Contents-amd64.gz
@@ -163,7 +206,13 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
                     else:
                         # main/dep11/by-hash/MD5Sum/0af5c69679a24671cfd7579095a9cb5e
                         # deep_tmp_dir is in pkgidx_tmp_dir hence no extra garbage collection needed
-                        deep_tmp_dir = dist_dir / Path(fn.parts[0]) / Path(fn.parts[1]) / ".tmp" / Path('/'.join(fn.parts[2:-1]))
+                        deep_tmp_dir = (
+                            dist_dir
+                            / Path(fn.parts[0])
+                            / Path(fn.parts[1])
+                            / ".tmp"
+                            / Path("/".join(fn.parts[2:-1]))
+                        )
                         deep_tmp_dir.mkdir(parents=True, exist_ok=True)
                         pkgidx_file = deep_tmp_dir / fn.name
                 else:
@@ -174,33 +223,41 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
                         print("Failed to download:", pkglist_url)
                         continue
 
-                    with pkgidx_file.open('rb') as t: content = t.read()
+                    with pkgidx_file.open("rb") as t:
+                        content = t.read()
                     if len(content) != int(filesize):
-                        print(f"Invalid size of {pkgidx_file}, expected {filesize}, skipped")
+                        print(
+                            f"Invalid size of {pkgidx_file}, expected {filesize}, skipped"
+                        )
                         pkgidx_file.unlink()
                         continue
                     if not skip_checksum and hashlib.sha256(content).hexdigest() != checksum:
-                        print(f"Invalid checksum of {pkgidx_file}, expected {checksum}, skipped")
+                        print(
+                            f"Invalid checksum of {pkgidx_file}, expected {checksum}, skipped"
+                        )
                         pkgidx_file.unlink()
                         continue
-                    if pkgidx_content is None and pkgidx_file.stem == 'Packages':
-                        print(f"getting packages index content from {pkgidx_file.name}", flush=True)
+                    if pkgidx_content is None and pkgidx_file.stem == "Packages":
+                        print(
+                            f"getting packages index content from {pkgidx_file.name}",
+                            flush=True,
+                        )
                         suffix = pkgidx_file.suffix
-                        if suffix == '.xz':
-                            pkgidx_content = lzma.decompress(content).decode('utf-8')
-                        elif suffix == '.bz2':
-                            pkgidx_content = bz2.decompress(content).decode('utf-8')
-                        elif suffix == '.gz':
-                            pkgidx_content = gzip.decompress(content).decode('utf-8')
-                        elif suffix == '':
-                            pkgidx_content = content.decode('utf-8')
+                        if suffix == ".xz":
+                            pkgidx_content = lzma.decompress(content).decode("utf-8")
+                        elif suffix == ".bz2":
+                            pkgidx_content = bz2.decompress(content).decode("utf-8")
+                        elif suffix == ".gz":
+                            pkgidx_content = gzip.decompress(content).decode("utf-8")
+                        elif suffix == "":
+                            pkgidx_content = content.decode("utf-8")
                         else:
                             print("unsupported format")
 
             # Currently only support SHA-256 checksum, because
             # "Clients may not use the MD5Sum and SHA1 fields for security purposes, and must require a SHA256 or a SHA512 field."
             # from https://wiki.debian.org/DebianRepository/Format#A.22Release.22_files
-            if line.startswith('SHA256:'):
+            if line.startswith("SHA256:"):
                 cnt_start = True
         if not cnt_start:
             print("Cannot find SHA-256 checksum")
@@ -219,6 +276,7 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
     except:
         traceback.print_exc()
         return 1
+
     if arch in ARCH_NO_PKGIDX:
         if collect_tmp_dir() == 1:
             return 1
@@ -227,8 +285,10 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
     if pkgidx_content is None:
         print("index is empty, failed")
-        if len(list(pkgidx_dir.glob('Packages*'))) == 0:
-            print(f"{pkgidx_dir/'Packages'} never existed, upstream may not provide {dist}/{repo}/{arch}, ignore this error")
+        if len(list(pkgidx_dir.glob("Packages*"))) == 0:
+            print(
+                f"{pkgidx_dir/'Packages'} never existed, upstream may not provide {dist}/{repo}/{arch}, ignore this error"
+            )
             return 0
         return 1
@@ -236,7 +296,7 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
     err = 0
     deb_count = 0
     deb_size = 0
-    for pkg in pkgidx_content.split('\n\n'):
+    for pkg in pkgidx_content.split("\n\n"):
         if len(pkg) < 10: # ignore blanks
             continue
         try:
@@ -255,14 +315,14 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
         dest_dir = dest_filename.parent
         if not dest_dir.is_dir():
             dest_dir.mkdir(parents=True, exist_ok=True)
-        if dest_filename.suffix == '.deb':
+        if dest_filename.suffix == ".deb":
             deb_set[str(dest_filename.relative_to(dest_base_dir))] = pkg_size
         if dest_filename.is_file() and dest_filename.stat().st_size == pkg_size:
             print(f"Skipping {pkg_filename}, size {pkg_size}")
             continue
         pkg_url = f"{base_url}/{pkg_filename}"
-        dest_tmp_filename = dest_filename.with_name('._syncing_.' + dest_filename.name)
+        dest_tmp_filename = dest_filename.with_name("._syncing_." + dest_filename.name)
         for retry in range(MAX_RETRY):
             print(f"downloading {pkg_url} to {dest_filename}", flush=True)
             # break # dry run
@@ -289,13 +349,18 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
     print(f"{deb_count} packages, {deb_size} bytes in total", flush=True)
     return err
 
+
 def apt_delete_old_debs(dest_base_dir: Path, remote_set: Dict[str, int], dry_run: bool):
-    on_disk = set([
-        str(i.relative_to(dest_base_dir)) for i in dest_base_dir.glob('**/*.deb')])
+    on_disk = set(
+        [str(i.relative_to(dest_base_dir)) for i in dest_base_dir.glob("**/*.deb")]
+    )
     deleting = on_disk - remote_set.keys()
     # print(on_disk)
     # print(remote_set)
-    print(f"Deleting {len(deleting)} packages not in the index{' (dry run)' if dry_run else ''}", flush=True)
+    print(
+        f"Deleting {len(deleting)} packages not in the index{' (dry run)' if dry_run else ''}",
+        flush=True,
+    )
     for i in deleting:
         if dry_run:
             print("Will delete", i)
@@ -303,6 +368,7 @@ def apt_delete_old_debs(dest_base_dir: Path, remote_set: Dict[str, int], dry_run
             print("Deleting", i)
             (dest_base_dir / i).unlink()
 
+
 def main():
     parser = argparse.ArgumentParser()
@@ -311,33 +377,37 @@ def main():
     parser.add_argument("component", type=str, help="e.g. multiverse,contrib")
     parser.add_argument("arch", type=str, help="e.g. i386,amd64")
     parser.add_argument("working_dir", type=Path, help="working directory")
-    parser.add_argument("--delete", action='store_true',
-                        help='delete unreferenced package files')
-    parser.add_argument("--delete-dry-run", action='store_true',
-                        help='print package files to be deleted only')
+    parser.add_argument(
+        "--delete", action="store_true", help="delete unreferenced package files"
+    )
+    parser.add_argument(
+        "--delete-dry-run",
+        action="store_true",
+        help="print package files to be deleted only",
+    )
     parser.add_argument("--skip-checksum", action='store_true',
                         help='skip checksum validation')
     args = parser.parse_args()
 
     # generate lists of os codenames
-    os_list = args.os_version.split(',')
+    os_list = args.os_version.split(",")
     check_args("os_version", os_list)
     os_list = replace_os_template(os_list)
 
     # generate a list of components and archs for each os codename
     def generate_list_for_oses(raw: str, name: str) -> List[List[str]]:
         n_os = len(os_list)
-        if ':' in raw:
+        if ":" in raw:
             # specify os codenames for each component
             lists = []
-            for l in raw.split(':'):
-                list_for_os = l.split(',')
+            for l in raw.split(":"):
+                list_for_os = l.split(",")
                 check_args(name, list_for_os)
                 lists.append(list_for_os)
             assert len(lists) == n_os, f"{name} must be specified for each component"
         else:
             # use same os codenames for all components
-            l = raw.split(',')
+            l = raw.split(",")
             check_args(name, l)
             lists = [l] * n_os
         return lists
@@ -352,7 +422,12 @@ def main():
     for os, arch_list, comp_list in zip(os_list, arch_lists, component_lists):
         for comp in comp_list:
             for arch in arch_list:
-                if apt_mirror(args.base_url, os, comp, arch, args.working_dir, deb_set=deb_set, skip_checksum=args.skip_checksum) != 0:
+                if (
+                    apt_mirror(
+                        args.base_url, os, comp, arch, args.working_dir, deb_set=deb_set, skip_checksum=args.skip_checksum
+                    )
+                    != 0
+                ):
                     failed.append((os, comp, arch))
     if len(failed) > 0:
         print(f"Failed APT repos of {args.base_url}: ", failed)
@@ -365,5 +440,6 @@ def main():
         total_size = sum(deb_set.values())
         fd.write(f"+{total_size}")
 
+
 if __name__ == "__main__":
    main()
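
For reference, a hypothetical way to exercise the flag is to set sys.argv before calling the script's main(). The positional order (base_url, os_version, component, arch, working_dir) is assumed from the parser calls visible above and upstream apt-sync.py; the script name, URL, and paths are placeholders, not values from this commit:

import sys

# Hypothetical invocation sketch; adjust the placeholders for a real mirror.
sys.argv = [
    "apt-sync.py",                        # assumed script name
    "http://archive.ubuntu.com/ubuntu/",  # base_url (placeholder)
    "@ubuntu-lts",                        # os_version, expanded via OS_TEMPLATE
    "main",                               # component
    "amd64",                              # arch
    "/data/mirror/ubuntu",                # working_dir (placeholder)
    "--skip-checksum",                    # skip SHA-256 verification of index files
]
main()  # runs the sync; the Release-file size check still applies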