add skip-checksum

yylbfyl 2024-08-22 16:57:08 +08:00
parent d731922d53
commit 5f2382ca9c
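
The --skip-checksum flag gates only the SHA-256 verification of downloaded package index files; the size recorded in the Release file is still enforced, as the diff below shows. A minimal sketch of that gating logic, written as a standalone helper for clarity (the name verify_index is hypothetical and not part of the script):

import hashlib
from pathlib import Path

def verify_index(path: Path, expected_size: int, expected_sha256: str,
                 skip_checksum: bool = False) -> bool:
    # Mirrors the checks in apt_mirror(): the size from the Release file is
    # always enforced, while the SHA-256 comparison is bypassed when
    # skip_checksum is set.
    content = path.read_bytes()
    if len(content) != expected_size:
        return False
    if not skip_checksum and hashlib.sha256(content).hexdigest() != expected_sha256:
        return False
    return True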


@@ -4,7 +4,6 @@ import traceback
 import os
 import re
 import shutil
-import subprocess as sp
 import argparse
 import bz2
 import gzip
@@ -23,21 +22,27 @@ requests.utils.default_user_agent = lambda: APT_SYNC_USER_AGENT
 # set preferred address family
 import requests.packages.urllib3.util.connection as urllib3_cn
-USE_ADDR_FAMILY = os.getenv('USE_ADDR_FAMILY', '').strip().lower()
-if USE_ADDR_FAMILY != '':
-    assert USE_ADDR_FAMILY in ['ipv4', 'ipv6'], "USE_ADDR_FAMILY must be either ipv4 or ipv6"
-    urllib3_cn.allowed_gai_family = lambda: socket.AF_INET if USE_ADDR_FAMILY == 'ipv4' else socket.AF_INET6
+USE_ADDR_FAMILY = os.getenv("USE_ADDR_FAMILY", "").strip().lower()
+if USE_ADDR_FAMILY != "":
+    assert USE_ADDR_FAMILY in [
+        "ipv4",
+        "ipv6",
+    ], "USE_ADDR_FAMILY must be either ipv4 or ipv6"
+    urllib3_cn.allowed_gai_family = lambda: (
+        socket.AF_INET if USE_ADDR_FAMILY == "ipv4" else socket.AF_INET6
+    )
 
 OS_TEMPLATE = {
-    'ubuntu-lts': ["focal", "jammy", "noble"],
-    'debian-current': ["buster", "bullseye", "bookworm"],
-    'debian-latest2': ["bullseye", "bookworm"],
-    'debian-latest': ["bookworm"],
+    "ubuntu-lts": ["focal", "jammy", "noble"],
+    "debian-current": ["bullseye", "bookworm"],
+    "debian-latest2": ["bullseye", "bookworm"],
+    "debian-latest": ["bookworm"],
 }
-ARCH_NO_PKGIDX = ['dep11', 'i18n', 'cnf']
-MAX_RETRY=int(os.getenv('MAX_RETRY', '3'))
-DOWNLOAD_TIMEOUT=int(os.getenv('DOWNLOAD_TIMEOUT', '1800'))
-REPO_SIZE_FILE = os.getenv('REPO_SIZE_FILE', '')
+ARCH_NO_PKGIDX = ["dep11", "i18n", "cnf"]
+MAX_RETRY = int(os.getenv("MAX_RETRY", "3"))
+DOWNLOAD_TIMEOUT = int(os.getenv("DOWNLOAD_TIMEOUT", "1800"))
+REPO_SIZE_FILE = os.getenv("REPO_SIZE_FILE", "")
 
 pattern_os_template = re.compile(r"@\{(.+)\}")
 pattern_package_name = re.compile(r"^Filename: (.+)$", re.MULTILINE)
@@ -45,11 +50,13 @@ pattern_package_size = re.compile(r"^Size: (\d+)$", re.MULTILINE)
 pattern_package_sha256 = re.compile(r"^SHA256: (\w{64})$", re.MULTILINE)
 download_cache = dict()
 
+
 def check_args(prop: str, lst: List[str]):
     for s in lst:
-        if len(s)==0 or ' ' in s:
+        if len(s) == 0 or " " in s:
             raise ValueError(f"Invalid item in {prop}: {repr(s)}")
 
+
 def replace_os_template(os_list: List[str]) -> List[str]:
     ret = []
     for i in os_list:
@@ -57,46 +64,54 @@ def replace_os_template(os_list: List[str]) -> List[str]:
         if matched:
             for os in OS_TEMPLATE[matched.group(1)]:
                 ret.append(pattern_os_template.sub(os, i))
-        elif i.startswith('@'):
+        elif i.startswith("@"):
             ret.extend(OS_TEMPLATE[i[1:]])
         else:
             ret.append(i)
     return ret
 
+
 def check_and_download(url: str, dst_file: Path, caching=False) -> int:
     try:
         if caching:
             if url in download_cache:
                 print(f"Using cached content: {url}", flush=True)
-                with dst_file.open('wb') as f:
+                with dst_file.open("wb") as f:
                     f.write(download_cache[url])
                 return 0
             download_cache[url] = bytes()
         start = time.time()
         with requests.get(url, stream=True, timeout=(5, 10)) as r:
             r.raise_for_status()
-            if 'last-modified' in r.headers:
+            if "last-modified" in r.headers:
                 remote_ts = parsedate_to_datetime(
-                    r.headers['last-modified']).timestamp()
-            else: remote_ts = None
+                    r.headers["last-modified"]
+                ).timestamp()
+            else:
+                remote_ts = None
 
-            with dst_file.open('wb') as f:
+            with dst_file.open("wb") as f:
                 for chunk in r.iter_content(chunk_size=1024**2):
                     if time.time() - start > DOWNLOAD_TIMEOUT:
                         raise TimeoutError("Download timeout")
-                    if not chunk: continue # filter out keep-alive new chunks
+                    if not chunk:
+                        continue  # filter out keep-alive new chunks
                     f.write(chunk)
-                    if caching: download_cache[url] += chunk
+                    if caching:
+                        download_cache[url] += chunk
             if remote_ts is not None:
                 os.utime(dst_file, (remote_ts, remote_ts))
         return 0
     except BaseException as e:
         print(e, flush=True)
-        if dst_file.is_file(): dst_file.unlink()
-        if url in download_cache: del download_cache[url]
+        if dst_file.is_file():
+            dst_file.unlink()
+        if url in download_cache:
+            del download_cache[url]
         return 1
 
+
 def mkdir_with_dot_tmp(folder: Path) -> Tuple[Path, Path]:
     tmpdir = folder / ".tmp"
     if tmpdir.is_dir():
@@ -104,9 +119,10 @@ def mkdir_with_dot_tmp(folder: Path)->Tuple[Path, Path]:
     tmpdir.mkdir(parents=True, exist_ok=True)
     return (folder, tmpdir)
 
+
 def move_files_in(src: Path, dst: Path):
     empty = True
-    for file in src.glob('*'):
+    for file in src.glob("*"):
         empty = False
         print(f"moving {file} to {dst}")
         # shutil.move(str(file), str(dst))
@@ -119,7 +135,17 @@ def move_files_in(src: Path, dst: Path):
     if empty:
         print(f"{src} is empty")
 
-def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Path, deb_set: Dict[str, int], skip_checksum: bool)->int:
+
+def apt_mirror(
+    base_url: str,
+    dist: str,
+    repo: str,
+    arch: str,
+    dest_base_dir: Path,
+    deb_set: Dict[str, int],
+    skip_checksum: bool,
+) -> int:
     if not dest_base_dir.is_dir():
         print("Destination directory is empty, cannot continue")
         return 1
@@ -127,14 +153,27 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
     # download Release files
     dist_dir, dist_tmp_dir = mkdir_with_dot_tmp(dest_base_dir / "dists" / dist)
-    check_and_download(f"{base_url}/dists/{dist}/InRelease",dist_tmp_dir / "InRelease", caching=True)
-    if check_and_download(f"{base_url}/dists/{dist}/Release",dist_tmp_dir / "Release", caching=True) != 0:
+    check_and_download(
+        f"{base_url}/dists/{dist}/InRelease", dist_tmp_dir / "InRelease", caching=True
+    )
+    if (
+        check_and_download(
+            f"{base_url}/dists/{dist}/Release", dist_tmp_dir / "Release", caching=True
+        )
+        != 0
+    ):
         print("Invalid Repository")
         if not (dist_dir / "Release").is_file():
-            print(f"{dist_dir/'Release'} never existed, upstream may not provide packages for {dist}, ignore this error")
+            print(
+                f"{dist_dir/'Release'} never existed, upstream may not provide packages for {dist}, ignore this error"
+            )
             return 0
         return 1
-    check_and_download(f"{base_url}/dists/{dist}/Release.gpg",dist_tmp_dir / "Release.gpg", caching=True)
+    check_and_download(
+        f"{base_url}/dists/{dist}/Release.gpg",
+        dist_tmp_dir / "Release.gpg",
+        caching=True,
+    )
 
     comp_dir, comp_tmp_dir = mkdir_with_dot_tmp(dist_dir / repo)
@@ -148,12 +187,16 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
         for line in fd:
             if cnt_start:
                 fields = line.split()
-                if len(fields) != 3 or len(fields[0]) != 64: # 64 is SHA-256 checksum length
+                if (
+                    len(fields) != 3 or len(fields[0]) != 64
+                ):  # 64 is SHA-256 checksum length
                     break
                 checksum, filesize, filename = tuple(fields)
-                if filename.startswith(f"{repo}/{arch_dir}/") or \
-                   filename.startswith(f"{repo}/Contents-{arch}") or \
-                   filename.startswith(f"Contents-{arch}"):
+                if (
+                    filename.startswith(f"{repo}/{arch_dir}/")
+                    or filename.startswith(f"{repo}/Contents-{arch}")
+                    or filename.startswith(f"Contents-{arch}")
+                ):
                     fn = Path(filename)
                     if len(fn.parts) <= 3:
                         # Contents-amd64.gz
@@ -163,7 +206,13 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
                     else:
                         # main/dep11/by-hash/MD5Sum/0af5c69679a24671cfd7579095a9cb5e
                         # deep_tmp_dir is in pkgidx_tmp_dir hence no extra garbage collection needed
-                        deep_tmp_dir = dist_dir / Path(fn.parts[0]) / Path(fn.parts[1]) / ".tmp" / Path('/'.join(fn.parts[2:-1]))
+                        deep_tmp_dir = (
+                            dist_dir
+                            / Path(fn.parts[0])
+                            / Path(fn.parts[1])
+                            / ".tmp"
+                            / Path("/".join(fn.parts[2:-1]))
+                        )
                         deep_tmp_dir.mkdir(parents=True, exist_ok=True)
                         pkgidx_file = deep_tmp_dir / fn.name
                 else:
@@ -174,33 +223,41 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
                         print("Failed to download:", pkglist_url)
                         continue
 
-                    with pkgidx_file.open('rb') as t: content = t.read()
+                    with pkgidx_file.open("rb") as t:
+                        content = t.read()
                     if len(content) != int(filesize):
-                        print(f"Invalid size of {pkgidx_file}, expected {filesize}, skipped")
+                        print(
+                            f"Invalid size of {pkgidx_file}, expected {filesize}, skipped"
+                        )
                         pkgidx_file.unlink()
                         continue
                     if not skip_checksum and hashlib.sha256(content).hexdigest() != checksum:
-                        print(f"Invalid checksum of {pkgidx_file}, expected {checksum}, skipped")
+                        print(
+                            f"Invalid checksum of {pkgidx_file}, expected {checksum}, skipped"
+                        )
                         pkgidx_file.unlink()
                         continue
-                    if pkgidx_content is None and pkgidx_file.stem == 'Packages':
-                        print(f"getting packages index content from {pkgidx_file.name}", flush=True)
+                    if pkgidx_content is None and pkgidx_file.stem == "Packages":
+                        print(
+                            f"getting packages index content from {pkgidx_file.name}",
+                            flush=True,
+                        )
                         suffix = pkgidx_file.suffix
-                        if suffix == '.xz':
-                            pkgidx_content = lzma.decompress(content).decode('utf-8')
-                        elif suffix == '.bz2':
-                            pkgidx_content = bz2.decompress(content).decode('utf-8')
-                        elif suffix == '.gz':
-                            pkgidx_content = gzip.decompress(content).decode('utf-8')
-                        elif suffix == '':
-                            pkgidx_content = content.decode('utf-8')
+                        if suffix == ".xz":
+                            pkgidx_content = lzma.decompress(content).decode("utf-8")
+                        elif suffix == ".bz2":
+                            pkgidx_content = bz2.decompress(content).decode("utf-8")
+                        elif suffix == ".gz":
+                            pkgidx_content = gzip.decompress(content).decode("utf-8")
+                        elif suffix == "":
+                            pkgidx_content = content.decode("utf-8")
                         else:
                             print("unsupported format")
 
             # Currently only support SHA-256 checksum, because
             # "Clients may not use the MD5Sum and SHA1 fields for security purposes, and must require a SHA256 or a SHA512 field."
             # from https://wiki.debian.org/DebianRepository/Format#A.22Release.22_files
-            if line.startswith('SHA256:'):
+            if line.startswith("SHA256:"):
                 cnt_start = True
         if not cnt_start:
             print("Cannot find SHA-256 checksum")
@@ -219,6 +276,7 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
     except:
         traceback.print_exc()
         return 1
+
     if arch in ARCH_NO_PKGIDX:
         if collect_tmp_dir() == 1:
             return 1
@@ -227,8 +285,10 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
     if pkgidx_content is None:
         print("index is empty, failed")
-        if len(list(pkgidx_dir.glob('Packages*'))) == 0:
-            print(f"{pkgidx_dir/'Packages'} never existed, upstream may not provide {dist}/{repo}/{arch}, ignore this error")
+        if len(list(pkgidx_dir.glob("Packages*"))) == 0:
+            print(
+                f"{pkgidx_dir/'Packages'} never existed, upstream may not provide {dist}/{repo}/{arch}, ignore this error"
+            )
             return 0
         return 1
@@ -236,7 +296,7 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
     err = 0
     deb_count = 0
     deb_size = 0
-    for pkg in pkgidx_content.split('\n\n'):
+    for pkg in pkgidx_content.split("\n\n"):
         if len(pkg) < 10: # ignore blanks
             continue
         try:
@@ -255,14 +315,14 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
         dest_dir = dest_filename.parent
         if not dest_dir.is_dir():
             dest_dir.mkdir(parents=True, exist_ok=True)
-        if dest_filename.suffix == '.deb':
+        if dest_filename.suffix == ".deb":
             deb_set[str(dest_filename.relative_to(dest_base_dir))] = pkg_size
         if dest_filename.is_file() and dest_filename.stat().st_size == pkg_size:
             print(f"Skipping {pkg_filename}, size {pkg_size}")
             continue
         pkg_url = f"{base_url}/{pkg_filename}"
-        dest_tmp_filename = dest_filename.with_name('._syncing_.' + dest_filename.name)
+        dest_tmp_filename = dest_filename.with_name("._syncing_." + dest_filename.name)
         for retry in range(MAX_RETRY):
             print(f"downloading {pkg_url} to {dest_filename}", flush=True)
             # break # dry run
@@ -289,13 +349,18 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
     print(f"{deb_count} packages, {deb_size} bytes in total", flush=True)
     return err
 
+
 def apt_delete_old_debs(dest_base_dir: Path, remote_set: Dict[str, int], dry_run: bool):
-    on_disk = set([
-        str(i.relative_to(dest_base_dir)) for i in dest_base_dir.glob('**/*.deb')])
+    on_disk = set(
+        [str(i.relative_to(dest_base_dir)) for i in dest_base_dir.glob("**/*.deb")]
+    )
     deleting = on_disk - remote_set.keys()
     # print(on_disk)
     # print(remote_set)
-    print(f"Deleting {len(deleting)} packages not in the index{' (dry run)' if dry_run else ''}", flush=True)
+    print(
+        f"Deleting {len(deleting)} packages not in the index{' (dry run)' if dry_run else ''}",
+        flush=True,
+    )
     for i in deleting:
         if dry_run:
             print("Will delete", i)
@@ -303,6 +368,7 @@ def apt_delete_old_debs(dest_base_dir: Path, remote_set: Dict[str, int], dry_run
             print("Deleting", i)
             (dest_base_dir / i).unlink()
 
+
 def main():
     parser = argparse.ArgumentParser()
@@ -311,33 +377,37 @@ def main():
     parser.add_argument("component", type=str, help="e.g. multiverse,contrib")
     parser.add_argument("arch", type=str, help="e.g. i386,amd64")
     parser.add_argument("working_dir", type=Path, help="working directory")
-    parser.add_argument("--delete", action='store_true',
-                        help='delete unreferenced package files')
-    parser.add_argument("--delete-dry-run", action='store_true',
-                        help='print package files to be deleted only')
+    parser.add_argument(
+        "--delete", action="store_true", help="delete unreferenced package files"
+    )
+    parser.add_argument(
+        "--delete-dry-run",
+        action="store_true",
+        help="print package files to be deleted only",
+    )
     parser.add_argument("--skip-checksum", action='store_true',
                         help='skip checksum validation')
     args = parser.parse_args()
 
     # generate lists of os codenames
-    os_list = args.os_version.split(',')
+    os_list = args.os_version.split(",")
     check_args("os_version", os_list)
     os_list = replace_os_template(os_list)
 
     # generate a list of components and archs for each os codename
     def generate_list_for_oses(raw: str, name: str) -> List[List[str]]:
         n_os = len(os_list)
-        if ':' in raw:
+        if ":" in raw:
             # specify os codenames for each component
             lists = []
-            for l in raw.split(':'):
-                list_for_os = l.split(',')
+            for l in raw.split(":"):
+                list_for_os = l.split(",")
                 check_args(name, list_for_os)
                 lists.append(list_for_os)
             assert len(lists) == n_os, f"{name} must be specified for each component"
         else:
             # use same os codenames for all components
-            l = raw.split(',')
+            l = raw.split(",")
             check_args(name, l)
             lists = [l] * n_os
         return lists
@@ -352,7 +422,12 @@ def main():
     for os, arch_list, comp_list in zip(os_list, arch_lists, component_lists):
         for comp in comp_list:
             for arch in arch_list:
-                if apt_mirror(args.base_url, os, comp, arch, args.working_dir, deb_set=deb_set, skip_checksum=args.skip_checksum) != 0:
+                if (
+                    apt_mirror(
+                        args.base_url, os, comp, arch, args.working_dir, deb_set=deb_set, skip_checksum=args.skip_checksum
+                    )
+                    != 0
+                ):
                     failed.append((os, comp, arch))
     if len(failed) > 0:
         print(f"Failed APT repos of {args.base_url}: ", failed)
@@ -365,5 +440,6 @@ def main():
         total_size = sum(deb_set.values())
         fd.write(f"+{total_size}")
 
+
 if __name__ == "__main__":
    main()
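
For reference, a hypothetical way to exercise the flag is to set sys.argv before calling the script's main(). The positional order (base_url, os_version, component, arch, working_dir) is assumed from the parser calls visible above and upstream apt-sync.py; the script name, URL, and paths are placeholders, not values from this commit:

import sys

# Hypothetical invocation sketch; adjust the placeholders for a real mirror.
sys.argv = [
    "apt-sync.py",                        # assumed script name
    "http://archive.ubuntu.com/ubuntu/",  # base_url (placeholder)
    "@ubuntu-lts",                        # os_version, expanded via OS_TEMPLATE
    "main",                               # component
    "amd64",                              # arch
    "/data/mirror/ubuntu",                # working_dir (placeholder)
    "--skip-checksum",                    # skip SHA-256 verification of index files
]
main()  # runs the sync; the Release-file size check still applies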