Run formatter on some Python scripts

Signed-off-by: Harry Chen <i@harrychen.xyz>
Harry Chen 2024-08-17 13:33:31 +08:00
parent f8afa1f57c
commit 5f4bc1c260
6 changed files with 449 additions and 258 deletions
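
The commit message does not name the formatter, but the changes below match Black's conventions: strings normalized to double quotes, long calls broken out one argument per line, and hand-aligned tuples wrapped in new "# fmt: off" / "# fmt: on" guards. The following is only a sketch of how such a pass might be reproduced; the choice of Black and the script names are assumptions, not recorded in this commit.

# Hypothetical reproduction of the formatting pass (assumes Black;
# the script names are placeholders, not taken from this diff).
import subprocess

scripts = ["anaconda.py", "apt-sync.py"]  # placeholder file names

# Preview what would change without rewriting anything.
subprocess.run(["black", "--check", "--diff", *scripts])

# Rewrite the files in place; regions between "# fmt: off" and
# "# fmt: on" are left untouched by Black.
subprocess.run(["black", *scripts], check=True)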

View File

@@ -1,17 +1,11 @@
#!/usr/bin/env python3
import hashlib
import traceback
import json
import os
import re
import shutil
import subprocess as sp
import tempfile
import argparse
import time
from email.utils import parsedate_to_datetime
from pathlib import Path
from typing import List, Set, Tuple, IO
from typing import Set
import requests
DOWNLOAD_TIMEOUT = int(os.getenv('DOWNLOAD_TIMEOUT', '1800'))

View File

@@ -25,6 +25,7 @@ CONDA_CLOUD_BASE_URL = os.getenv("CONDA_COULD_URL", "https://conda.anaconda.org"
WORKING_DIR = os.getenv("TUNASYNC_WORKING_DIR")
# fmt: off
CONDA_REPOS = ("main", "free", "r", "msys2")
CONDA_ARCHES = (
"noarch", "linux-64", "linux-32", "linux-aarch64", "linux-armv6l", "linux-armv7l",
@@ -72,6 +73,7 @@ CONDA_CLOUD_REPOS = (
EXCLUDED_PACKAGES = (
"pytorch-nightly", "pytorch-nightly-cpu", "ignite-nightly",
)
# fmt: on
# connect and read timeout value
TIMEOUT_OPTION = (7, 10)
@@ -84,28 +86,31 @@ logging.basicConfig(
format="[%(asctime)s] [%(levelname)s] %(message)s",
)
def sizeof_fmt(num, suffix='iB'):
for unit in ['','K','M','G','T','P','E','Z']:
def sizeof_fmt(num, suffix="iB"):
for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
if abs(num) < 1024.0:
return "%3.2f%s%s" % (num, unit, suffix)
num /= 1024.0
return "%.2f%s%s" % (num, 'Y', suffix)
return "%.2f%s%s" % (num, "Y", suffix)
def md5_check(file: Path, md5: str = None):
m = hashlib.md5()
with file.open('rb') as f:
with file.open("rb") as f:
while True:
buf = f.read(1*1024*1024)
buf = f.read(1 * 1024 * 1024)
if not buf:
break
m.update(buf)
return m.hexdigest() == md5
def sha256_check(file: Path, sha256: str = None):
m = hashlib.sha256()
with file.open('rb') as f:
with file.open("rb") as f:
while True:
buf = f.read(1*1024*1024)
buf = f.read(1 * 1024 * 1024)
if not buf:
break
m.update(buf)
@@ -113,34 +118,42 @@ def sha256_check(file: Path, sha256: str = None):
def curl_download(remote_url: str, dst_file: Path, sha256: str = None, md5: str = None):
sp.check_call([
"curl", "-o", str(dst_file),
"-sL", "--remote-time", "--show-error",
"--fail", "--retry", "10", "--speed-time", "15",
"--speed-limit", "5000", remote_url,
])
# fmt: off
sp.check_call(
[
"curl", "-o", str(dst_file),
"-sL", "--remote-time", "--show-error",
"--fail", "--retry", "10",
"--speed-time", "15",
"--speed-limit", "5000",
remote_url,
]
)
# fmt: on
if sha256 and (not sha256_check(dst_file, sha256)):
return "SHA256 mismatch"
if md5 and (not md5_check(dst_file, md5)):
return "MD5 mismatch"
def sync_repo(repo_url: str, local_dir: Path, tmpdir: Path, delete: bool, remove_legacy: bool):
def sync_repo(
repo_url: str, local_dir: Path, tmpdir: Path, delete: bool, remove_legacy: bool
):
logging.info("Start syncing {}".format(repo_url))
local_dir.mkdir(parents=True, exist_ok=True)
repodata_url = repo_url + '/repodata.json'
bz2_repodata_url = repo_url + '/repodata.json.bz2'
repodata_url = repo_url + "/repodata.json"
bz2_repodata_url = repo_url + "/repodata.json.bz2"
# https://github.com/conda/conda/issues/13256, from conda 24.1.x
zst_repodata_url = repo_url + '/repodata.json.zst'
zst_repodata_url = repo_url + "/repodata.json.zst"
# https://docs.conda.io/projects/conda-build/en/latest/release-notes.html
# "current_repodata.json" - like repodata.json, but only has the newest version of each file
current_repodata_url = repo_url + '/current_repodata.json'
current_repodata_url = repo_url + "/current_repodata.json"
tmp_repodata = tmpdir / "repodata.json"
tmp_bz2_repodata = tmpdir / "repodata.json.bz2"
tmp_zst_repodata = tmpdir / "repodata.json.zst"
tmp_current_repodata = tmpdir / 'current_repodata.json'
tmp_current_repodata = tmpdir / "current_repodata.json"
curl_download(repodata_url, tmp_repodata)
curl_download(bz2_repodata_url, tmp_bz2_repodata)
@@ -158,31 +171,33 @@ def sync_repo(repo_url: str, local_dir: Path, tmpdir: Path, delete: bool, remove
remote_filelist = []
total_size = 0
legacy_packages = repodata['packages']
legacy_packages = repodata["packages"]
conda_packages = repodata.get("packages.conda", {})
if remove_legacy:
# https://github.com/anaconda/conda/blob/0dbf85e0546e0b0dc060c8265ec936591ccbe980/conda/core/subdir_data.py#L440-L442
use_legacy_packages = set(legacy_packages.keys()) - set(k[:-6] + ".tar.bz2" for k in conda_packages.keys())
use_legacy_packages = set(legacy_packages.keys()) - set(
k[:-6] + ".tar.bz2" for k in conda_packages.keys()
)
legacy_packages = {k: legacy_packages[k] for k in use_legacy_packages}
packages = {**legacy_packages, **conda_packages}
for filename, meta in packages.items():
if meta['name'] in EXCLUDED_PACKAGES:
if meta["name"] in EXCLUDED_PACKAGES:
continue
file_size = meta['size']
file_size = meta["size"]
# prefer sha256 over md5
sha256 = None
md5 = None
if 'sha256' in meta:
sha256 = meta['sha256']
elif 'md5' in meta:
md5 = meta['md5']
if "sha256" in meta:
sha256 = meta["sha256"]
elif "md5" in meta:
md5 = meta["md5"]
total_size += file_size
pkg_url = '/'.join([repo_url, filename])
pkg_url = "/".join([repo_url, filename])
dst_file = local_dir / filename
dst_file_wip = local_dir / ('.downloading.' + filename)
dst_file_wip = local_dir / (".downloading." + filename)
remote_filelist.append(dst_file)
if dst_file.is_file():
@@ -202,7 +217,7 @@ def sync_repo(repo_url: str, local_dir: Path, tmpdir: Path, delete: bool, remove
if err is None:
dst_file_wip.rename(dst_file)
except sp.CalledProcessError:
err = 'CalledProcessError'
err = "CalledProcessError"
if err is None:
break
logging.error("Failed to download {}: {}".format(filename, err))
@@ -223,11 +238,15 @@ def sync_repo(repo_url: str, local_dir: Path, tmpdir: Path, delete: bool, remove
tmp_current_repodata_gz_gened = False
if tmp_current_repodata.is_file():
if os.path.getsize(tmp_current_repodata) > GEN_METADATA_JSON_GZIP_THRESHOLD:
sp.check_call(["gzip", "--no-name", "--keep", "--", str(tmp_current_repodata)])
shutil.move(str(tmp_current_repodata) + ".gz", str(local_dir / "current_repodata.json.gz"))
sp.check_call(
["gzip", "--no-name", "--keep", "--", str(tmp_current_repodata)]
)
shutil.move(
str(tmp_current_repodata) + ".gz",
str(local_dir / "current_repodata.json.gz"),
)
tmp_current_repodata_gz_gened = True
shutil.move(str(tmp_current_repodata), str(
local_dir / "current_repodata.json"))
shutil.move(str(tmp_current_repodata), str(local_dir / "current_repodata.json"))
if not tmp_current_repodata_gz_gened:
# If the gzip file is not generated, remove the dangling gzip archive
Path(local_dir / "current_repodata.json.gz").unlink(missing_ok=True)
@@ -235,9 +254,9 @@ def sync_repo(repo_url: str, local_dir: Path, tmpdir: Path, delete: bool, remove
if delete:
local_filelist = []
delete_count = 0
for i in local_dir.glob('*.tar.bz2'):
for i in local_dir.glob("*.tar.bz2"):
local_filelist.append(i)
for i in local_dir.glob('*.conda'):
for i in local_dir.glob("*.conda"):
local_filelist.append(i)
for i in set(local_filelist) - set(remote_filelist):
logging.info("Deleting {}".format(i))
@@ -245,46 +264,53 @@ def sync_repo(repo_url: str, local_dir: Path, tmpdir: Path, delete: bool, remove
delete_count += 1
logging.info("{} files deleted".format(delete_count))
logging.info("{}: {} files, {} in total".format(
repodata_url, len(remote_filelist), sizeof_fmt(total_size)))
logging.info(
"{}: {} files, {} in total".format(
repodata_url, len(remote_filelist), sizeof_fmt(total_size)
)
)
return total_size
def sync_installer(repo_url, local_dir: Path):
logging.info("Start syncing {}".format(repo_url))
local_dir.mkdir(parents=True, exist_ok=True)
full_scan = random.random() < 0.1 # Do full version check less frequently
full_scan = random.random() < 0.1 # Do full version check less frequently
def remote_list():
r = requests.get(repo_url, timeout=TIMEOUT_OPTION)
d = pq(r.content)
for tr in d('table').find('tr'):
tds = pq(tr).find('td')
for tr in d("table").find("tr"):
tds = pq(tr).find("td")
if len(tds) != 4:
continue
fname = tds[0].find('a').text
fname = tds[0].find("a").text
sha256 = tds[3].text
if sha256 == '<directory>' or len(sha256) != 64:
if sha256 == "<directory>" or len(sha256) != 64:
continue
yield (fname, sha256)
for filename, sha256 in remote_list():
pkg_url = "/".join([repo_url, filename])
dst_file = local_dir / filename
dst_file_wip = local_dir / ('.downloading.' + filename)
dst_file_wip = local_dir / (".downloading." + filename)
if dst_file.is_file():
r = requests.head(pkg_url, allow_redirects=True, timeout=TIMEOUT_OPTION)
len_avail = 'content-length' in r.headers
len_avail = "content-length" in r.headers
if len_avail:
remote_filesize = int(r.headers['content-length'])
remote_date = parsedate_to_datetime(r.headers['last-modified'])
remote_filesize = int(r.headers["content-length"])
remote_date = parsedate_to_datetime(r.headers["last-modified"])
stat = dst_file.stat()
local_filesize = stat.st_size
local_mtime = stat.st_mtime
# Do content verification on ~5% of files (see issue #25)
if (not len_avail or remote_filesize == local_filesize) and remote_date.timestamp() == local_mtime and \
(random.random() < 0.95 or sha256_check(dst_file, sha256)):
if (
(not len_avail or remote_filesize == local_filesize)
and remote_date.timestamp() == local_mtime
and (random.random() < 0.95 or sha256_check(dst_file, sha256))
):
logging.info("Skipping {}".format(filename))
# Stop the scanning if the most recent version is present
@@ -299,25 +325,31 @@ def sync_installer(repo_url, local_dir: Path):
for retry in range(3):
logging.info("Downloading {}".format(filename))
err = ''
err = ""
try:
err = curl_download(pkg_url, dst_file_wip, sha256=sha256)
if err is None:
dst_file_wip.rename(dst_file)
except sp.CalledProcessError:
err = 'CalledProcessError'
err = "CalledProcessError"
if err is None:
break
logging.error("Failed to download {}: {}".format(filename, err))
def main():
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--working-dir", default=WORKING_DIR)
parser.add_argument("--delete", action='store_true',
help='delete unreferenced package files')
parser.add_argument("--remove-legacy", action='store_true',
help='delete legacy packages which have conda counterpart. Requires client conda >= 4.7.0')
parser.add_argument(
"--delete", action="store_true", help="delete unreferenced package files"
)
parser.add_argument(
"--remove-legacy",
action="store_true",
help="delete legacy packages which have conda counterpart. Requires client conda >= 4.7.0",
)
args = parser.parse_args()
if args.working_dir is None:
@@ -336,7 +368,8 @@ def main():
try:
sync_installer(remote_url, local_dir)
size_statistics += sum(
f.stat().st_size for f in local_dir.glob('*') if f.is_file())
f.stat().st_size for f in local_dir.glob("*") if f.is_file()
)
except Exception:
logging.exception("Failed to sync installers of {}".format(dist))
success = False
@@ -348,8 +381,9 @@ def main():
tmpdir = tempfile.mkdtemp()
try:
size_statistics += sync_repo(remote_url,
local_dir, Path(tmpdir), args.delete, args.remove_legacy)
size_statistics += sync_repo(
remote_url, local_dir, Path(tmpdir), args.delete, args.remove_legacy
)
except Exception:
logging.exception("Failed to sync repo: {}/{}".format(repo, arch))
success = False
@@ -362,8 +396,9 @@ def main():
tmpdir = tempfile.mkdtemp()
try:
size_statistics += sync_repo(remote_url,
local_dir, Path(tmpdir), args.delete, args.remove_legacy)
size_statistics += sync_repo(
remote_url, local_dir, Path(tmpdir), args.delete, args.remove_legacy
)
except Exception:
logging.exception("Failed to sync repo: {}".format(repo))
success = False
@@ -374,6 +409,7 @@ def main():
if not success:
sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -4,7 +4,6 @@ import traceback
import os
import re
import shutil
import subprocess as sp
import argparse
import bz2
import gzip
@@ -23,21 +22,27 @@ requests.utils.default_user_agent = lambda: APT_SYNC_USER_AGENT
# set preferred address family
import requests.packages.urllib3.util.connection as urllib3_cn
USE_ADDR_FAMILY = os.getenv('USE_ADDR_FAMILY', '').strip().lower()
if USE_ADDR_FAMILY != '':
assert USE_ADDR_FAMILY in ['ipv4', 'ipv6'], "USE_ADDR_FAMILY must be either ipv4 or ipv6"
urllib3_cn.allowed_gai_family = lambda: socket.AF_INET if USE_ADDR_FAMILY == 'ipv4' else socket.AF_INET6
USE_ADDR_FAMILY = os.getenv("USE_ADDR_FAMILY", "").strip().lower()
if USE_ADDR_FAMILY != "":
assert USE_ADDR_FAMILY in [
"ipv4",
"ipv6",
], "USE_ADDR_FAMILY must be either ipv4 or ipv6"
urllib3_cn.allowed_gai_family = lambda: (
socket.AF_INET if USE_ADDR_FAMILY == "ipv4" else socket.AF_INET6
)
OS_TEMPLATE = {
'ubuntu-lts': ["focal", "jammy", "noble"],
'debian-current': ["bullseye", "bookworm"],
'debian-latest2': ["bullseye", "bookworm"],
'debian-latest': ["bookworm"],
"ubuntu-lts": ["focal", "jammy", "noble"],
"debian-current": ["bullseye", "bookworm"],
"debian-latest2": ["bullseye", "bookworm"],
"debian-latest": ["bookworm"],
}
ARCH_NO_PKGIDX = ['dep11', 'i18n', 'cnf']
MAX_RETRY=int(os.getenv('MAX_RETRY', '3'))
DOWNLOAD_TIMEOUT=int(os.getenv('DOWNLOAD_TIMEOUT', '1800'))
REPO_SIZE_FILE = os.getenv('REPO_SIZE_FILE', '')
ARCH_NO_PKGIDX = ["dep11", "i18n", "cnf"]
MAX_RETRY = int(os.getenv("MAX_RETRY", "3"))
DOWNLOAD_TIMEOUT = int(os.getenv("DOWNLOAD_TIMEOUT", "1800"))
REPO_SIZE_FILE = os.getenv("REPO_SIZE_FILE", "")
pattern_os_template = re.compile(r"@\{(.+)\}")
pattern_package_name = re.compile(r"^Filename: (.+)$", re.MULTILINE)
@@ -45,11 +50,13 @@ pattern_package_size = re.compile(r"^Size: (\d+)$", re.MULTILINE)
pattern_package_sha256 = re.compile(r"^SHA256: (\w{64})$", re.MULTILINE)
download_cache = dict()
def check_args(prop: str, lst: List[str]):
for s in lst:
if len(s)==0 or ' ' in s:
if len(s) == 0 or " " in s:
raise ValueError(f"Invalid item in {prop}: {repr(s)}")
def replace_os_template(os_list: List[str]) -> List[str]:
ret = []
for i in os_list:
@@ -57,103 +64,137 @@ def replace_os_template(os_list: List[str]) -> List[str]:
if matched:
for os in OS_TEMPLATE[matched.group(1)]:
ret.append(pattern_os_template.sub(os, i))
elif i.startswith('@'):
elif i.startswith("@"):
ret.extend(OS_TEMPLATE[i[1:]])
else:
ret.append(i)
return ret
def check_and_download(url: str, dst_file: Path, caching = False)->int:
def check_and_download(url: str, dst_file: Path, caching=False) -> int:
try:
if caching:
if url in download_cache:
print(f"Using cached content: {url}", flush=True)
with dst_file.open('wb') as f:
with dst_file.open("wb") as f:
f.write(download_cache[url])
return 0
download_cache[url] = bytes()
start = time.time()
with requests.get(url, stream=True, timeout=(5, 10)) as r:
r.raise_for_status()
if 'last-modified' in r.headers:
if "last-modified" in r.headers:
remote_ts = parsedate_to_datetime(
r.headers['last-modified']).timestamp()
else: remote_ts = None
r.headers["last-modified"]
).timestamp()
else:
remote_ts = None
with dst_file.open('wb') as f:
with dst_file.open("wb") as f:
for chunk in r.iter_content(chunk_size=1024**2):
if time.time() - start > DOWNLOAD_TIMEOUT:
raise TimeoutError("Download timeout")
if not chunk: continue # filter out keep-alive new chunks
if not chunk:
continue # filter out keep-alive new chunks
f.write(chunk)
if caching: download_cache[url] += chunk
if caching:
download_cache[url] += chunk
if remote_ts is not None:
os.utime(dst_file, (remote_ts, remote_ts))
return 0
except BaseException as e:
print(e, flush=True)
if dst_file.is_file(): dst_file.unlink()
if url in download_cache: del download_cache[url]
if dst_file.is_file():
dst_file.unlink()
if url in download_cache:
del download_cache[url]
return 1
def mkdir_with_dot_tmp(folder: Path)->Tuple[Path, Path]:
def mkdir_with_dot_tmp(folder: Path) -> Tuple[Path, Path]:
tmpdir = folder / ".tmp"
if tmpdir.is_dir():
shutil.rmtree(str(tmpdir))
tmpdir.mkdir(parents=True, exist_ok=True)
return (folder, tmpdir)
def move_files_in(src: Path, dst: Path):
empty = True
for file in src.glob('*'):
for file in src.glob("*"):
empty = False
print(f"moving {file} to {dst}")
# shutil.move(str(file), str(dst))
if file.is_dir():
(dst / file.name).mkdir(parents=True, exist_ok=True)
move_files_in(file, dst / file.name)
file.rmdir() # rmdir wont fail as all files in it have been moved
file.rmdir() # rmdir wont fail as all files in it have been moved
else:
file.rename(dst / file.name) # Overwrite files
file.rename(dst / file.name) # Overwrite files
if empty:
print(f"{src} is empty")
def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Path, deb_set: Dict[str, int])->int:
def apt_mirror(
base_url: str,
dist: str,
repo: str,
arch: str,
dest_base_dir: Path,
deb_set: Dict[str, int],
) -> int:
if not dest_base_dir.is_dir():
print("Destination directory is empty, cannot continue")
return 1
print(f"Started mirroring {base_url} {dist}, {repo}, {arch}!", flush=True)
# download Release files
dist_dir,dist_tmp_dir = mkdir_with_dot_tmp(dest_base_dir / "dists" / dist)
check_and_download(f"{base_url}/dists/{dist}/InRelease",dist_tmp_dir / "InRelease", caching=True)
if check_and_download(f"{base_url}/dists/{dist}/Release",dist_tmp_dir / "Release", caching=True) != 0:
# download Release files
dist_dir, dist_tmp_dir = mkdir_with_dot_tmp(dest_base_dir / "dists" / dist)
check_and_download(
f"{base_url}/dists/{dist}/InRelease", dist_tmp_dir / "InRelease", caching=True
)
if (
check_and_download(
f"{base_url}/dists/{dist}/Release", dist_tmp_dir / "Release", caching=True
)
!= 0
):
print("Invalid Repository")
if not (dist_dir/"Release").is_file():
print(f"{dist_dir/'Release'} never existed, upstream may not provide packages for {dist}, ignore this error")
if not (dist_dir / "Release").is_file():
print(
f"{dist_dir/'Release'} never existed, upstream may not provide packages for {dist}, ignore this error"
)
return 0
return 1
check_and_download(f"{base_url}/dists/{dist}/Release.gpg",dist_tmp_dir / "Release.gpg", caching=True)
check_and_download(
f"{base_url}/dists/{dist}/Release.gpg",
dist_tmp_dir / "Release.gpg",
caching=True,
)
comp_dir,comp_tmp_dir = mkdir_with_dot_tmp(dist_dir / repo)
comp_dir, comp_tmp_dir = mkdir_with_dot_tmp(dist_dir / repo)
# load Package Index URLs from the Release file
# load Package Index URLs from the Release file
release_file = dist_tmp_dir / "Release"
arch_dir = arch if arch in ARCH_NO_PKGIDX else f"binary-{arch}"
pkgidx_dir,pkgidx_tmp_dir = mkdir_with_dot_tmp(comp_dir / arch_dir)
pkgidx_dir, pkgidx_tmp_dir = mkdir_with_dot_tmp(comp_dir / arch_dir)
with open(release_file, "r") as fd:
pkgidx_content=None
cnt_start=False
pkgidx_content = None
cnt_start = False
for line in fd:
if cnt_start:
fields = line.split()
if len(fields) != 3 or len(fields[0]) != 64: # 64 is SHA-256 checksum length
if (
len(fields) != 3 or len(fields[0]) != 64
): # 64 is SHA-256 checksum length
break
checksum, filesize, filename = tuple(fields)
if filename.startswith(f"{repo}/{arch_dir}/") or \
filename.startswith(f"{repo}/Contents-{arch}") or \
filename.startswith(f"Contents-{arch}"):
if (
filename.startswith(f"{repo}/{arch_dir}/")
or filename.startswith(f"{repo}/Contents-{arch}")
or filename.startswith(f"Contents-{arch}")
):
fn = Path(filename)
if len(fn.parts) <= 3:
# Contents-amd64.gz
@@ -163,7 +204,13 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
else:
# main/dep11/by-hash/MD5Sum/0af5c69679a24671cfd7579095a9cb5e
# deep_tmp_dir is in pkgidx_tmp_dir hence no extra garbage collection needed
deep_tmp_dir = dist_dir / Path(fn.parts[0]) / Path(fn.parts[1]) / ".tmp" / Path('/'.join(fn.parts[2:-1]))
deep_tmp_dir = (
dist_dir
/ Path(fn.parts[0])
/ Path(fn.parts[1])
/ ".tmp"
/ Path("/".join(fn.parts[2:-1]))
)
deep_tmp_dir.mkdir(parents=True, exist_ok=True)
pkgidx_file = deep_tmp_dir / fn.name
else:
@@ -174,33 +221,41 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
print("Failed to download:", pkglist_url)
continue
with pkgidx_file.open('rb') as t: content = t.read()
with pkgidx_file.open("rb") as t:
content = t.read()
if len(content) != int(filesize):
print(f"Invalid size of {pkgidx_file}, expected {filesize}, skipped")
print(
f"Invalid size of {pkgidx_file}, expected {filesize}, skipped"
)
pkgidx_file.unlink()
continue
if hashlib.sha256(content).hexdigest() != checksum:
print(f"Invalid checksum of {pkgidx_file}, expected {checksum}, skipped")
print(
f"Invalid checksum of {pkgidx_file}, expected {checksum}, skipped"
)
pkgidx_file.unlink()
continue
if pkgidx_content is None and pkgidx_file.stem == 'Packages':
print(f"getting packages index content from {pkgidx_file.name}", flush=True)
if pkgidx_content is None and pkgidx_file.stem == "Packages":
print(
f"getting packages index content from {pkgidx_file.name}",
flush=True,
)
suffix = pkgidx_file.suffix
if suffix == '.xz':
pkgidx_content = lzma.decompress(content).decode('utf-8')
elif suffix == '.bz2':
pkgidx_content = bz2.decompress(content).decode('utf-8')
elif suffix == '.gz':
pkgidx_content = gzip.decompress(content).decode('utf-8')
elif suffix == '':
pkgidx_content = content.decode('utf-8')
if suffix == ".xz":
pkgidx_content = lzma.decompress(content).decode("utf-8")
elif suffix == ".bz2":
pkgidx_content = bz2.decompress(content).decode("utf-8")
elif suffix == ".gz":
pkgidx_content = gzip.decompress(content).decode("utf-8")
elif suffix == "":
pkgidx_content = content.decode("utf-8")
else:
print("unsupported format")
# Currently only support SHA-256 checksum, because
# "Clients may not use the MD5Sum and SHA1 fields for security purposes, and must require a SHA256 or a SHA512 field."
# from https://wiki.debian.org/DebianRepository/Format#A.22Release.22_files
if line.startswith('SHA256:'):
if line.startswith("SHA256:"):
cnt_start = True
if not cnt_start:
print("Cannot find SHA-256 checksum")
@@ -219,6 +274,7 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
except:
traceback.print_exc()
return 1
if arch in ARCH_NO_PKGIDX:
if collect_tmp_dir() == 1:
return 1
@@ -227,8 +283,10 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
if pkgidx_content is None:
print("index is empty, failed")
if len(list(pkgidx_dir.glob('Packages*'))) == 0:
print(f"{pkgidx_dir/'Packages'} never existed, upstream may not provide {dist}/{repo}/{arch}, ignore this error")
if len(list(pkgidx_dir.glob("Packages*"))) == 0:
print(
f"{pkgidx_dir/'Packages'} never existed, upstream may not provide {dist}/{repo}/{arch}, ignore this error"
)
return 0
return 1
@@ -236,8 +294,8 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
err = 0
deb_count = 0
deb_size = 0
for pkg in pkgidx_content.split('\n\n'):
if len(pkg) < 10: # ignore blanks
for pkg in pkgidx_content.split("\n\n"):
if len(pkg) < 10: # ignore blanks
continue
try:
pkg_filename = pattern_package_name.search(pkg).group(1)
@@ -255,14 +313,14 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
dest_dir = dest_filename.parent
if not dest_dir.is_dir():
dest_dir.mkdir(parents=True, exist_ok=True)
if dest_filename.suffix == '.deb':
if dest_filename.suffix == ".deb":
deb_set[str(dest_filename.relative_to(dest_base_dir))] = pkg_size
if dest_filename.is_file() and dest_filename.stat().st_size == pkg_size:
print(f"Skipping {pkg_filename}, size {pkg_size}")
continue
pkg_url=f"{base_url}/{pkg_filename}"
dest_tmp_filename = dest_filename.with_name('._syncing_.' + dest_filename.name)
pkg_url = f"{base_url}/{pkg_filename}"
dest_tmp_filename = dest_filename.with_name("._syncing_." + dest_filename.name)
for retry in range(MAX_RETRY):
print(f"downloading {pkg_url} to {dest_filename}", flush=True)
# break # dry run
@@ -289,19 +347,25 @@ def apt_mirror(base_url: str, dist: str, repo: str, arch: str, dest_base_dir: Pa
print(f"{deb_count} packages, {deb_size} bytes in total", flush=True)
return err
def apt_delete_old_debs(dest_base_dir: Path, remote_set: Dict[str, int], dry_run: bool):
on_disk = set([
str(i.relative_to(dest_base_dir)) for i in dest_base_dir.glob('**/*.deb')])
on_disk = set(
[str(i.relative_to(dest_base_dir)) for i in dest_base_dir.glob("**/*.deb")]
)
deleting = on_disk - remote_set.keys()
# print(on_disk)
# print(remote_set)
print(f"Deleting {len(deleting)} packages not in the index{' (dry run)' if dry_run else ''}", flush=True)
print(
f"Deleting {len(deleting)} packages not in the index{' (dry run)' if dry_run else ''}",
flush=True,
)
for i in deleting:
if dry_run:
print("Will delete", i)
else:
print("Deleting", i)
(dest_base_dir/i).unlink()
(dest_base_dir / i).unlink()
def main():
@@ -311,31 +375,35 @@ def main():
parser.add_argument("component", type=str, help="e.g. multiverse,contrib")
parser.add_argument("arch", type=str, help="e.g. i386,amd64")
parser.add_argument("working_dir", type=Path, help="working directory")
parser.add_argument("--delete", action='store_true',
help='delete unreferenced package files')
parser.add_argument("--delete-dry-run", action='store_true',
help='print package files to be deleted only')
parser.add_argument(
"--delete", action="store_true", help="delete unreferenced package files"
)
parser.add_argument(
"--delete-dry-run",
action="store_true",
help="print package files to be deleted only",
)
args = parser.parse_args()
# generate lists of os codenames
os_list = args.os_version.split(',')
os_list = args.os_version.split(",")
check_args("os_version", os_list)
os_list = replace_os_template(os_list)
# generate a list of components and archs for each os codename
def generate_list_for_oses(raw: str, name: str) -> List[List[str]]:
n_os = len(os_list)
if ':' in raw:
if ":" in raw:
# specify os codenames for each component
lists = []
for l in raw.split(':'):
list_for_os = l.split(',')
for l in raw.split(":"):
list_for_os = l.split(",")
check_args(name, list_for_os)
lists.append(list_for_os)
assert len(lists) == n_os, f"{name} must be specified for each component"
else:
# use same os codenames for all components
l = raw.split(',')
l = raw.split(",")
check_args(name, l)
lists = [l] * n_os
return lists
@@ -350,7 +418,12 @@ def main():
for os, arch_list, comp_list in zip(os_list, arch_lists, component_lists):
for comp in comp_list:
for arch in arch_list:
if apt_mirror(args.base_url, os, comp, arch, args.working_dir, deb_set=deb_set) != 0:
if (
apt_mirror(
args.base_url, os, comp, arch, args.working_dir, deb_set=deb_set
)
!= 0
):
failed.append((os, comp, arch))
if len(failed) > 0:
print(f"Failed APT repos of {args.base_url}: ", failed)
@@ -363,5 +436,6 @@ def main():
total_size = sum(deb_set.values())
fd.write(f"+{total_size}")
if __name__ == "__main__":
main()

View File

@@ -1,36 +1,46 @@
#!/usr/bin/env python3
import os
import sys
import threading
import traceback
import queue
from pathlib import Path
from datetime import datetime
import tempfile
import hashlib
import requests
BASE_URL = os.getenv("TUNASYNC_UPSTREAM_URL", "https://api.github.com/repos/")
WORKING_DIR = os.getenv("TUNASYNC_WORKING_DIR")
MIRROR_BASE_URL = os.getenv("MIRROR_BASE_URL", 'https://mirrors.tuna.tsinghua.edu.cn/github-raw/')
MIRROR_BASE_URL = os.getenv(
"MIRROR_BASE_URL", "https://mirrors.tuna.tsinghua.edu.cn/github-raw/"
)
def raw_to_mirror(s: str) -> str:
return s.replace("https://raw.githubusercontent.com/",
MIRROR_BASE_URL)
return s.replace("https://raw.githubusercontent.com/", MIRROR_BASE_URL)
def delete_line_with(w: str, s: str) -> str:
return "\n".join(list(filter(lambda x: x.count(w) == 0, s.splitlines())))
def delete_line_with_gbpdistro(s: str) -> str:
return delete_line_with("gbpdistro", s)
REPOS = [
# owner/repo, tree, tree, tree, blob
## for stackage
["fpco/stackage-content", "master", "stack", "global-hints.yaml"],
## for rosdep
{ "path": ["ros/rosdistro", "master", "rosdep", "sources.list.d", "20-default.list"], "filter": [ raw_to_mirror, delete_line_with_gbpdistro ] },
{
"path": [
"ros/rosdistro",
"master",
"rosdep",
"sources.list.d",
"20-default.list",
],
"filter": [raw_to_mirror, delete_line_with_gbpdistro],
},
["ros/rosdistro", "master", "rosdep", "osx-homebrew.yaml"],
["ros/rosdistro", "master", "rosdep", "base.yaml"],
["ros/rosdistro", "master", "rosdep", "python.yaml"],
@@ -44,36 +54,46 @@ REPOS = [
TIMEOUT_OPTION = (7, 10)
total_size = 0
# wrap around requests.get to use token if available
def github_get(*args, **kwargs):
headers = kwargs['headers'] if 'headers' in kwargs else {}
if 'GITHUB_TOKEN' in os.environ:
headers['Authorization'] = 'token {}'.format(
os.environ['GITHUB_TOKEN'])
kwargs['headers'] = headers
headers = kwargs["headers"] if "headers" in kwargs else {}
if "GITHUB_TOKEN" in os.environ:
headers["Authorization"] = "token {}".format(os.environ["GITHUB_TOKEN"])
kwargs["headers"] = headers
return requests.get(*args, **kwargs)
def github_tree(*args, **kwargs):
headers = kwargs['headers'] if 'headers' in kwargs else {}
headers = kwargs["headers"] if "headers" in kwargs else {}
headers["Accept"] = "application/vnd.github.v3+json"
kwargs['headers'] = headers
kwargs["headers"] = headers
return github_get(*args, **kwargs)
# NOTE blob API supports file up to 100MB
# To get larger one, we need raw.githubcontent, which is not implemented now
def github_blob(*args, **kwargs):
headers = kwargs['headers'] if 'headers' in kwargs else {}
headers = kwargs["headers"] if "headers" in kwargs else {}
headers["Accept"] = "application/vnd.github.v3.raw"
kwargs['headers'] = headers
kwargs["headers"] = headers
return github_get(*args, **kwargs)
def do_download(remote_url: str, dst_file: Path, remote_size: int, sha: str, filter=None):
def do_download(
remote_url: str, dst_file: Path, remote_size: int, sha: str, filter=None
):
# NOTE the stream=True parameter below
with github_blob(remote_url, stream=True) as r:
r.raise_for_status()
tmp_dst_file = None
try:
with tempfile.NamedTemporaryFile(prefix="." + dst_file.name + ".", suffix=".tmp", dir=dst_file.parent, delete=False) as f:
with tempfile.NamedTemporaryFile(
prefix="." + dst_file.name + ".",
suffix=".tmp",
dir=dst_file.parent,
delete=False,
) as f:
tmp_dst_file = Path(f.name)
for chunk in r.iter_content(chunk_size=1024**2):
if chunk: # filter out keep-alive new chunks
@@ -82,7 +102,9 @@ def do_download(remote_url: str, dst_file: Path, remote_size: int, sha: str, fil
# check for downloaded size
downloaded_size = tmp_dst_file.stat().st_size
if remote_size != -1 and downloaded_size != remote_size:
raise Exception(f'File {dst_file.as_posix()} size mismatch: downloaded {downloaded_size} bytes, expected {remote_size} bytes')
raise Exception(
f"File {dst_file.as_posix()} size mismatch: downloaded {downloaded_size} bytes, expected {remote_size} bytes"
)
if filter != None:
with open(tmp_dst_file, "r+") as f:
s = f.read()
@@ -108,25 +130,26 @@ def do_download(remote_url: str, dst_file: Path, remote_size: int, sha: str, fil
if tmp_dst_file.is_file():
tmp_dst_file.unlink()
def downloading_worker(q):
while True:
item = q.get()
if item is None:
break
filter = item.pop(0) # remove filter
filter = item.pop(0) # remove filter
dst_file = Path('/'.join(item))
dst_file = Path("/".join(item))
dst_file.parent.mkdir(parents=True, exist_ok=True)
item.pop(0) # remove working dir
item.pop(0) # remove working dir
owner_repo = item.pop(0)
try:
tree = item.pop(0)
tree_child = item.pop(0)
child_is_leaf = False
url = ''
sha = ''
url = ""
sha = ""
size = 0
while not child_is_leaf:
with github_tree(f"{BASE_URL}{owner_repo}/git/trees/{tree}") as r:
@@ -147,8 +170,7 @@ def downloading_worker(q):
break
else:
raise Exception
if not dst_file.is_symlink() or \
Path(os.readlink(dst_file)).name != sha:
if not dst_file.is_symlink() or Path(os.readlink(dst_file)).name != sha:
do_download(url, dst_file, size, sha, filter)
else:
print("Skip", dst_file)
@@ -164,16 +186,19 @@ def downloading_worker(q):
def create_workers(n):
task_queue = queue.Queue()
for i in range(n):
t = threading.Thread(target=downloading_worker, args=(task_queue, ))
t = threading.Thread(target=downloading_worker, args=(task_queue,))
t.start()
return task_queue
def main():
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--working-dir", default=WORKING_DIR)
parser.add_argument("--workers", default=1, type=int,
help='number of concurrent downloading jobs')
parser.add_argument(
"--workers", default=1, type=int, help="number of concurrent downloading jobs"
)
args = parser.parse_args()
if args.working_dir is None:
@@ -198,6 +223,7 @@ def main():
for i in range(args.workers):
task_queue.put(None)
if __name__ == "__main__":
main()

View File

@@ -10,25 +10,30 @@ from pathlib import Path
# mainly from apt-sync.py
FORMULAE_BREW_SH_GITHUB_ACTIONS_ARTIFACT_API = os.getenv("TUNASYNC_UPSTREAM_URL", "https://api.github.com/repos/Homebrew/formulae.brew.sh/actions/artifacts?name=github-pages")
FORMULAE_BREW_SH_GITHUB_ACTIONS_ARTIFACT_API = os.getenv(
"TUNASYNC_UPSTREAM_URL",
"https://api.github.com/repos/Homebrew/formulae.brew.sh/actions/artifacts?name=github-pages",
)
WORKING_DIR = Path(os.getenv("TUNASYNC_WORKING_DIR", "/data"))
DOWNLOAD_TIMEOUT=int(os.getenv('DOWNLOAD_TIMEOUT', '1800'))
DOWNLOAD_TIMEOUT = int(os.getenv("DOWNLOAD_TIMEOUT", "1800"))
github_api_headers = {
"Accept": "application/vnd.github+json",
"X-GitHub-Api-Version": "2022-11-28",
}
if 'GITHUB_TOKEN' in os.environ:
github_api_headers['Authorization'] = 'token {}'.format(
os.environ['GITHUB_TOKEN'])
if "GITHUB_TOKEN" in os.environ:
github_api_headers["Authorization"] = "token {}".format(os.environ["GITHUB_TOKEN"])
else:
# https://github.com/actions/upload-artifact/issues/51
# the token should have 'public_repo' access
raise Exception("GITHUB_TOKEN is required")
def formulae_github_pages(zip_file: Path, unzip_directory: Path, tar_directory: Path):
artifacts = requests.get(FORMULAE_BREW_SH_GITHUB_ACTIONS_ARTIFACT_API, headers=github_api_headers)
artifacts = requests.get(
FORMULAE_BREW_SH_GITHUB_ACTIONS_ARTIFACT_API, headers=github_api_headers
)
artifacts.raise_for_status()
artifacts = artifacts.json()
latest = None
@@ -40,7 +45,10 @@ def formulae_github_pages(zip_file: Path, unzip_directory: Path, tar_directory:
check_and_download(zip_url, zip_file, zip_file, github_api_headers)
sp.run(["unzip", str(zip_file), "-d", str(unzip_directory)])
sp.run(["tar", "-C", str(tar_directory), "-xf", str(unzip_directory / "artifact.tar")])
sp.run(
["tar", "-C", str(tar_directory), "-xf", str(unzip_directory / "artifact.tar")]
)
def bottles(formula_file: Path):
b = {}
@@ -49,7 +57,7 @@ def bottles(formula_file: Path):
for formula in formulae:
if formula["versions"]["bottle"] and "stable" in formula["bottle"]:
bs = formula["bottle"]["stable"]
for (platform, v) in bs["files"].items():
for platform, v in bs["files"].items():
sha256 = v["sha256"]
url = v["url"]
name = formula["name"]
@@ -63,28 +71,36 @@ def bottles(formula_file: Path):
}
return b
ghcr_headers = {
"Accept": "application/vnd.oci.image.index.v1+json",
"Authorization": "Bearer QQ=="
"Authorization": "Bearer QQ==",
}
# borrowed from apt-sync.py
def check_and_download(url: str, dst_file: Path, dst_tmp_file: Path, headers=ghcr_headers):
if dst_file.is_file(): return 2 # old file
def check_and_download(
url: str, dst_file: Path, dst_tmp_file: Path, headers=ghcr_headers
):
if dst_file.is_file():
return 2 # old file
try:
start = time.time()
with requests.get(url, stream=True, timeout=(5, 10), headers=headers) as r:
r.raise_for_status()
if 'last-modified' in r.headers:
if "last-modified" in r.headers:
remote_ts = parsedate_to_datetime(
r.headers['last-modified']).timestamp()
else: remote_ts = None
r.headers["last-modified"]
).timestamp()
else:
remote_ts = None
with dst_tmp_file.open('wb') as f:
with dst_tmp_file.open("wb") as f:
for chunk in r.iter_content(chunk_size=1024**2):
if time.time() - start > DOWNLOAD_TIMEOUT:
raise TimeoutError("Download timeout")
if not chunk: continue # filter out keep-alive new chunks
if not chunk:
continue # filter out keep-alive new chunks
f.write(chunk)
if remote_ts is not None:
@@ -92,9 +108,11 @@ def check_and_download(url: str, dst_file: Path, dst_tmp_file: Path, headers=ghc
return 0
except BaseException as e:
print(e, flush=True)
if dst_tmp_file.is_file(): dst_tmp_file.unlink()
if dst_tmp_file.is_file():
dst_tmp_file.unlink()
return 1
if __name__ == "__main__":
# clean tmp file from previous sync
TMP_DIR = WORKING_DIR / ".tmp"

View File

@@ -1,7 +1,6 @@
#!/usr/bin/env python3
import traceback
import os
import sys
import subprocess as sp
import tempfile
import argparse
@@ -16,47 +15,50 @@ from pathlib import Path
from typing import List, Dict
import requests
REPO_SIZE_FILE = os.getenv('REPO_SIZE_FILE', '')
DOWNLOAD_TIMEOUT=int(os.getenv('DOWNLOAD_TIMEOUT', '1800'))
REPO_SIZE_FILE = os.getenv("REPO_SIZE_FILE", "")
DOWNLOAD_TIMEOUT = int(os.getenv("DOWNLOAD_TIMEOUT", "1800"))
REPO_STAT = {}
def calc_repo_size(path: Path):
dbfiles = path.glob('repodata/*primary.*')
dbfiles = path.glob("repodata/*primary.*")
with tempfile.NamedTemporaryFile() as tmp:
dec = None
dbfile = None
for db in dbfiles:
dbfile = db
suffixes = db.suffixes
if suffixes[-1] == '.bz2':
if suffixes[-1] == ".bz2":
dec = bz2.decompress
suffixes = suffixes[:-1]
elif suffixes[-1] == '.gz':
elif suffixes[-1] == ".gz":
dec = gzip.decompress
suffixes = suffixes[:-1]
elif suffixes[-1] in ('.sqlite', '.xml'):
elif suffixes[-1] in (".sqlite", ".xml"):
dec = lambda x: x
if dec is None:
print(f"Failed to read from {path}: {list(dbfiles)}", flush=True)
return
with db.open('rb') as f:
with db.open("rb") as f:
tmp.write(dec(f.read()))
tmp.flush()
if suffixes[-1] == '.sqlite':
if suffixes[-1] == ".sqlite":
conn = sqlite3.connect(tmp.name)
c = conn.cursor()
c.execute("select sum(size_package),count(1) from packages")
size, cnt = c.fetchone()
conn.close()
elif suffixes[-1] == '.xml':
elif suffixes[-1] == ".xml":
try:
tree = ET.parse(tmp.name)
root = tree.getroot()
assert root.tag.endswith('metadata')
assert root.tag.endswith("metadata")
cnt, size = 0, 0
for location in root.findall('./{http://linux.duke.edu/metadata/common}package/{http://linux.duke.edu/metadata/common}size'):
size += int(location.attrib['package'])
for location in root.findall(
"./{http://linux.duke.edu/metadata/common}package/{http://linux.duke.edu/metadata/common}size"
):
size += int(location.attrib["package"])
cnt += 1
except:
traceback.print_exc()
@@ -69,23 +71,27 @@ def calc_repo_size(path: Path):
print(f" {cnt} packages, {size} bytes in total", flush=True)
global REPO_STAT
REPO_STAT[str(path)] = (size, cnt) if cnt > 0 else (0, 0) # size can be None
REPO_STAT[str(path)] = (size, cnt) if cnt > 0 else (0, 0) # size can be None
def check_and_download(url: str, dst_file: Path)->int:
def check_and_download(url: str, dst_file: Path) -> int:
try:
start = time.time()
with requests.get(url, stream=True, timeout=(5, 10)) as r:
r.raise_for_status()
if 'last-modified' in r.headers:
if "last-modified" in r.headers:
remote_ts = parsedate_to_datetime(
r.headers['last-modified']).timestamp()
else: remote_ts = None
r.headers["last-modified"]
).timestamp()
else:
remote_ts = None
with dst_file.open('wb') as f:
with dst_file.open("wb") as f:
for chunk in r.iter_content(chunk_size=1024**2):
if time.time() - start > DOWNLOAD_TIMEOUT:
raise TimeoutError("Download timeout")
if not chunk: continue # filter out keep-alive new chunks
if not chunk:
continue # filter out keep-alive new chunks
f.write(chunk)
if remote_ts is not None:
@@ -93,13 +99,15 @@ def check_and_download(url: str, dst_file: Path)->int:
return 0
except BaseException as e:
print(e, flush=True)
if dst_file.is_file(): dst_file.unlink()
if dst_file.is_file():
dst_file.unlink()
return 1
def download_repodata(url: str, path: Path) -> int:
path = path / "repodata"
path.mkdir(exist_ok=True)
oldfiles = set(path.glob('*.*'))
oldfiles = set(path.glob("*.*"))
newfiles = set()
if check_and_download(url + "/repodata/repomd.xml", path / ".repomd.xml") != 0:
print(f"Failed to download the repomd.xml of {url}")
@@ -107,64 +115,78 @@ def download_repodata(url: str, path: Path) -> int:
try:
tree = ET.parse(path / ".repomd.xml")
root = tree.getroot()
assert root.tag.endswith('repomd')
for location in root.findall('./{http://linux.duke.edu/metadata/repo}data/{http://linux.duke.edu/metadata/repo}location'):
href = location.attrib['href']
assert len(href) > 9 and href[:9] == 'repodata/'
fn = path / href[9:]
newfiles.add(fn)
if check_and_download(url + '/' + href, fn) != 0:
print(f"Failed to download the {href}")
return 1
assert root.tag.endswith("repomd")
for location in root.findall(
"./{http://linux.duke.edu/metadata/repo}data/{http://linux.duke.edu/metadata/repo}location"
):
href = location.attrib["href"]
assert len(href) > 9 and href[:9] == "repodata/"
fn = path / href[9:]
newfiles.add(fn)
if check_and_download(url + "/" + href, fn) != 0:
print(f"Failed to download the {href}")
return 1
except BaseException as e:
traceback.print_exc()
return 1
(path / ".repomd.xml").rename(path / "repomd.xml") # update the repomd.xml
(path / ".repomd.xml").rename(path / "repomd.xml") # update the repomd.xml
newfiles.add(path / "repomd.xml")
for i in (oldfiles - newfiles):
for i in oldfiles - newfiles:
print(f"Deleting old files: {i}")
i.unlink()
def check_args(prop: str, lst: List[str]):
for s in lst:
if len(s)==0 or ' ' in s:
if len(s) == 0 or " " in s:
raise ValueError(f"Invalid item in {prop}: {repr(s)}")
def substitute_vars(s: str, vardict: Dict[str, str]) -> str:
for key, val in vardict.items():
tpl = "@{"+key+"}"
tpl = "@{" + key + "}"
s = s.replace(tpl, val)
return s
def main():
parser = argparse.ArgumentParser()
parser.add_argument("base_url", type=str, help="base URL")
parser.add_argument("os_version", type=str, help="e.g. 7-8,9")
parser.add_argument("component", type=str, help="e.g. mysql56-community,mysql57-community")
parser.add_argument(
"component", type=str, help="e.g. mysql56-community,mysql57-community"
)
parser.add_argument("arch", type=str, help="e.g. x86_64,aarch64")
parser.add_argument("repo_name", type=str, help="e.g. @{comp}-el@{os_ver}")
parser.add_argument("working_dir", type=Path, help="working directory")
parser.add_argument("--download-repodata", action='store_true',
help='download repodata files instead of generating them')
parser.add_argument("--pass-arch-to-reposync", action='store_true',
help='''pass --arch to reposync to further filter packages by 'arch' field in metadata (NOT recommended, prone to missing packages in some repositories, e.g. mysql)''')
parser.add_argument(
"--download-repodata",
action="store_true",
help="download repodata files instead of generating them",
)
parser.add_argument(
"--pass-arch-to-reposync",
action="store_true",
help="""pass --arch to reposync to further filter packages by 'arch' field in metadata (NOT recommended, prone to missing packages in some repositories, e.g. mysql)""",
)
args = parser.parse_args()
os_list = []
for os_version in args.os_version.split(','):
if '-' in os_version and '-stream' not in os_version:
dash = os_version.index('-')
os_list = os_list + [ str(i) for i in range(
int(os_version[:dash]),
1+int(os_version[dash+1:])) ]
for os_version in args.os_version.split(","):
if "-" in os_version and "-stream" not in os_version:
dash = os_version.index("-")
os_list = os_list + [
str(i)
for i in range(int(os_version[:dash]), 1 + int(os_version[dash + 1 :]))
]
else:
os_list.append(os_version)
check_args("os_version", os_list)
component_list = args.component.split(',')
component_list = args.component.split(",")
check_args("component", component_list)
arch_list = args.arch.split(',')
arch_list = args.arch.split(",")
check_args("arch", arch_list)
failed = []
@@ -175,16 +197,18 @@ def main():
for os in os_list:
for comp in component_list:
vardict = {
'arch': arch,
'os_ver': os,
'comp': comp,
"arch": arch,
"os_ver": os,
"comp": comp,
}
name = substitute_vars(args.repo_name, vardict)
url = substitute_vars(args.base_url, vardict)
try:
probe_url = url + ('' if url.endswith('/') else '/') + "repodata/repomd.xml"
r = requests.head(probe_url, timeout=(7,7))
probe_url = (
url + ("" if url.endswith("/") else "/") + "repodata/repomd.xml"
)
r = requests.head(probe_url, timeout=(7, 7))
if r.status_code < 400 or r.status_code == 403:
yield (name, url)
else:
@@ -195,19 +219,23 @@ def main():
for arch in arch_list:
dest_dirs = []
conf = tempfile.NamedTemporaryFile("w", suffix=".conf")
conf.write('''
conf.write(
"""
[main]
keepcache=0
''')
"""
)
for name, url in combination_os_comp(arch):
conf.write(f'''
conf.write(
f"""
[{name}]
name={name}
baseurl={url}
repo_gpgcheck=0
gpgcheck=0
enabled=1
''')
"""
)
dst = (args.working_dir / name).absolute()
dst.mkdir(parents=True, exist_ok=True)
dest_dirs.append(dst)
@@ -217,13 +245,18 @@ enabled=1
if len(dest_dirs) == 0:
print("Nothing to sync", flush=True)
failed.append(('', arch))
failed.append(("", arch))
continue
cmd_args = [
"dnf", "reposync",
"-c", conf.name,
"--delete", "-p", str(args.working_dir.absolute())]
"dnf",
"reposync",
"-c",
conf.name,
"--delete",
"-p",
str(args.working_dir.absolute()),
]
if args.pass_arch_to_reposync:
cmd_args += ["--arch", arch]
print(f"Launching dnf reposync with command: {cmd_args}", flush=True)
@@ -237,7 +270,16 @@ enabled=1
if args.download_repodata:
download_repodata(url, path)
else:
cmd_args = ["createrepo_c", "--update", "-v", "-c", cache_dir, "-o", str(path), str(path)]
cmd_args = [
"createrepo_c",
"--update",
"-v",
"-c",
cache_dir,
"-o",
str(path),
str(path),
]
print(f"Launching createrepo with command: {cmd_args}", flush=True)
ret = sp.run(cmd_args)
calc_repo_size(path)
@@ -250,5 +292,6 @@ enabled=1
total_size = sum([r[0] for r in REPO_STAT.values()])
fd.write(f"+{total_size}")
if __name__ == "__main__":
main()