Adjust verify behavior

Verifying is heavily network-bound if we need to update EVERY package, so let's just check whether the local data is consistent instead of making sure everything is up-to-date.
taoky 2024-08-05 22:04:33 +08:00
parent 804fba55cb
commit 2aa8c9ae97
2 changed files with 55 additions and 10 deletions

@@ -71,7 +71,14 @@ If you already have a pypi repo, use `genlocal` first to generate a local db:

```shell
./shadowmire.py genlocal
```

Verify command could be used if you believe that something is wrong (inconsistent). It would:

1. remove packages NOT in local db
2. remove packages NOT in remote (with consideration of `--exclude`)
3. make sure all local indexes are valid, and (if `--sync-packages`) have valid local package files (`--prerelease-exclude` would be ignored)
4. delete unreferenced files in `packages` folder

```shell
./shadowmire.py verify --sync-packages
```

@@ -418,6 +418,26 @@ class SyncBase:

    def fetch_remote_versions(self) -> dict[str, int]:
        raise NotImplementedError

    def check_and_update(self, package_names: list[str]) -> None:
        to_update = []
        for package_name in tqdm(package_names, desc="Checking consistency"):
            package_simple_path = self.basedir / "simple" / package_name
            hrefs = get_existing_hrefs(package_simple_path)
            if not hrefs:
                to_update.append(package_name)
                continue
            # OK, check if all hrefs have corresponding files
            if self.sync_packages:
                should_update = False
                for href in hrefs:
                    dest = (package_simple_path / href).resolve()
                    if not dest.exists():
                        should_update = True
                        break
                if should_update:
                    to_update.append(package_name)
        self.parallel_update(to_update, [])

    def parallel_update(
        self, package_names: list, prerelease_excludes: list[re.Pattern[str]]
    ) -> None:
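The `(package_simple_path / href).resolve()` check works because simple indexes reference package files through relative hrefs, so resolving an href against the index directory yields the expected on-disk path. A small illustration (the layout and href below are hypothetical, not taken from this commit):

```python
from pathlib import Path

# Hypothetical mirror layout: simple/<pkg>/index.html links into packages/
package_simple_path = Path("/srv/pypi/simple/requests")
href = "../../packages/ab/cd/requests-2.32.3-py3-none-any.whl"

# resolve() collapses the ".." components into the packages/ directory,
# giving the file that check_and_update() tests with dest.exists()
dest = (package_simple_path / href).resolve()
print(dest)  # /srv/pypi/packages/ab/cd/requests-2.32.3-py3-none-any.whl
```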
@@ -432,7 +452,7 @@ class SyncBase:

            for idx, package_name in enumerate(package_names)
        }
        try:
            for future in tqdm(as_completed(futures), total=len(package_names), desc="Updating"):
                idx, package_name = futures[future]
                try:
                    serial = future.result()
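For context, `parallel_update` uses the standard `concurrent.futures` progress pattern: wrap `as_completed()` in `tqdm` so the bar advances as each future finishes, whatever order they complete in. A self-contained sketch of the pattern (the names here are illustrative, not shadowmire's):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm

def work(n: int) -> int:  # stand-in for do_update()
    return n * n

with ThreadPoolExecutor(max_workers=4) as executor:
    # Map each future back to its input so we know which job finished
    futures = {executor.submit(work, n): n for n in range(100)}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Updating"):
        n = futures[future]
        result = future.result()  # re-raises any exception from work()
```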
@@ -522,7 +542,11 @@ def download(session: requests.Session, url: str, dest: Path) -> tuple[bool, int]:

        logger.warning("download %s failed with exception", url, exc_info=True)
        return False, -1
    if resp.status_code >= 400:
        logger.warning(
            "download %s failed with status %s, skipping this package",
            url,
            resp.status_code,
        )
        return False, resp.status_code
    with overwrite(dest, "wb") as f:
        f.write(resp.content)
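`overwrite()` is shadowmire's helper for not clobbering `dest` with a partial download. A minimal sketch of the usual write-to-temp-then-rename implementation of such a helper (an assumption; the real one may differ):

```python
from contextlib import contextmanager
from pathlib import Path

@contextmanager
def overwrite(path: Path, mode: str = "w"):
    # Assumed implementation: write to a sibling temp file, then atomically
    # replace the target so readers never see a half-written file.
    tmp = path.parent / (path.name + ".tmp")
    with open(tmp, mode) as f:
        yield f
    tmp.replace(path)
```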
@@ -557,7 +581,9 @@ class SyncPyPI(SyncBase):

            meta = self.pypi.get_package_metadata(package_name)
            logger.debug("%s meta: %s", package_name, meta)
        except PackageNotFoundError:
            logger.warning(
                "%s missing from upstream, skip and ignore in the future.", package_name
            )
            # try removing it locally, since it does not exist upstream
            self.do_remove(package_name, write_db=False)
            if not write_db:
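`get_package_metadata()` presumably wraps PyPI's JSON API, which returns 404 for packages that have been deleted upstream. A simplified sketch (the endpoint is PyPI's real one; the class and signature here are illustrative):

```python
import requests

class PackageNotFoundError(Exception):
    pass

def get_package_metadata(session: requests.Session, package_name: str) -> dict:
    # https://pypi.org/pypi/<name>/json 404s for deleted packages, which
    # do_update() above turns into a local removal.
    resp = session.get(f"https://pypi.org/pypi/{package_name}/json")
    if resp.status_code == 404:
        raise PackageNotFoundError(package_name)
    resp.raise_for_status()
    return resp.json()
```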
@@ -655,13 +681,17 @@ class SyncPlainHTTP(SyncBase):

        for filename in ("index.html", "index.v1_html", "index.v1_json"):
            file_url = urljoin(self.upstream, f"/simple/{package_name}/{filename}")
            # Don't overwrite existing index first!
            success, code = download(
                self.session, file_url, package_simple_path / (filename + ".new")
            )
            if not success:
                if filename != "index.html":
                    logger.warning("index file %s fails", file_url)
                    continue
                else:
                    logger.error(
                        "critical index file %s fails. Stop with this.", file_url
                    )
                    if code == 404:
                        self.do_remove(package_name, write_db=False)
                        # We don't return -1 here, as shadowmire upstream would fix this inconsistency the next time it syncs.
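Downloading to `filename + ".new"` keeps the currently served index intact if any fetch fails; once all index files arrive, they would be renamed over the originals. A rough sketch of that promote step, which is not shown in this hunk and is an assumption about the surrounding code:

```python
from pathlib import Path

def promote_new_indexes(package_simple_path: Path) -> None:
    # Assumed follow-up to the loop above: only after every index file
    # downloaded successfully, atomically swap the ".new" files in.
    for filename in ("index.html", "index.v1_html", "index.v1_json"):
        new = package_simple_path / (filename + ".new")
        if new.exists():
            new.replace(package_simple_path / filename)
```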
@@ -851,15 +881,23 @@ def verify(

    basedir: Path = ctx.obj["basedir"]
    local_db: LocalVersionKV = ctx.obj["local_db"]
    excludes = exclude_to_excludes(exclude)
    # prerelease_excludes = exclude_to_excludes(prerelease_exclude)
    syncer = get_syncer(basedir, local_db, sync_packages, shadowmire_upstream)
    # 1. remove packages NOT in local db
    local_names = set(local_db.keys())
    simple_dirs = set([i.name for i in (basedir / "simple").iterdir() if i.is_dir()])
    for package_name in simple_dirs - local_names:
        syncer.do_remove(package_name)
    # 2. remove packages NOT in remote
    local = local_db.dump(skip_invalid=False)
    plan = syncer.determine_sync_plan(local, excludes)
    # We only take the plan.remove part here
    for package_name in plan.remove:
        syncer.do_remove(package_name)
    # 3. make sure all local indexes are valid, and (if --sync-packages) have valid local package files
    syncer.check_and_update(list(local_names))
    syncer.finalize()
    # 4. delete unreferenced files in `packages` folder
    ref_set = set()
    for sname in simple_dirs:
        sd = basedir / "simple" / sname
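The hunk ends inside step 4. The idea, as a rough sketch rather than this commit's exact code, is to collect every path referenced by any index into `ref_set`, then delete files under `packages/` that nothing references:

```python
# Sketch of step 4's remainder: resolve each index's hrefs to absolute paths,
# then unlink any package file not referenced by an index.
for sname in simple_dirs:
    sd = basedir / "simple" / sname
    for href in get_existing_hrefs(sd) or []:
        ref_set.add(str((sd / href).resolve()))
for p in (basedir / "packages").rglob("*"):
    if p.is_file() and str(p.resolve()) not in ref_set:
        p.unlink()
```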