Add pathcache generation as Step 3

This helps speed up check_and_update() when --compare-size is NOT set.
taoky 2024-09-04 00:18:08 +08:00
parent 7c963b00b1
commit 40c9edba50
2 changed files with 74 additions and 51 deletions
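In short: Step 3 walks `packages/` once up front and caches every file path in an in-memory set, so the consistency check can test set membership instead of issuing a `stat()` per file. A minimal standalone sketch of the idea (the helper names `collect_pathcache` and `exists_fast` are hypothetical; `packages_pathcache` matches the diff below):

```python
import os
from pathlib import Path

def collect_pathcache(packages_root: Path) -> set[str]:
    """Hypothetical helper: walk packages/ once, caching every file path."""
    cache: set[str] = set()
    for dirpath, _dirnames, filenames in os.walk(packages_root):
        for name in filenames:
            cache.add(os.path.join(dirpath, name))
    return cache

def exists_fast(dest_pathstr: str, pathcache: set[str]) -> bool:
    # An O(1) set lookup replaces a per-file stat() syscall; stat() is
    # still needed only when --compare-size requires the actual file size.
    return dest_pathstr in pathcache
```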

README.md

@@ -110,6 +110,8 @@ If you already have a PyPI repo, use `genlocal` first to generate a local db:
> 1. `./shadowmire.py verify --sync-packages --remove-not-in-local --compare-size`: remove any local packages that are missing from the upstream index (normally removed from PyPI), then download any missing metadata and packages. **This step is likely to take a very long time, depending on your network and disk speed.**
> * Q: Why will there be packages that are in neither local db nor remote index?
> * A: They are packages without valid local metadata that no longer exist on PyPI. These packages were typically downloaded a long time ago and later removed from upstream, but they may still share some blobs with currently available packages. E.g., after name normalization of `Foo` to `foo`, the two share all existing blobs, but `Foo` itself no longer changes.
> * Q: My HDD (array) is too, too, too slooooow, is there any way to speed things up?
> * A: You could try removing the `--compare-size` argument, at the cost of a very small chance of local package file inconsistencies going undetected.
> 1. `./shadowmire.py genlocal`: generate local database again.
> 1. `./shadowmire.py sync --sync-packages`: synchronize new changes after verification.

shadowmire.py

@@ -2,7 +2,7 @@
import sys
from types import FrameType
from typing import IO, Any, Callable, Generator, Literal, NoReturn, Optional, Set
from typing import IO, Any, Callable, Generator, Literal, NoReturn, Optional
import xmlrpc.client
from dataclasses import dataclass
import re
@@ -537,6 +537,7 @@ class SyncBase:
package_names: list[str],
prerelease_excludes: list[re.Pattern[str]],
json_files: set[str],
packages_pathcache: set[str],
compare_size: bool,
) -> bool:
def is_consistent(package_name: str) -> bool:
@@ -573,13 +574,16 @@
# OK, check if all hrefs have corresponding files
if self.sync_packages:
for href, size in hrefsize_json:
dest = Path(normpath(package_simple_path / href))
dest_pathstr = normpath(package_simple_path / href)
try:
dest_stat = dest.stat()
except FileNotFoundError:
logger.info("add %s as it's missing packages", package_name)
return False
# Fast shortcut to avoid stat() it
if dest_pathstr not in packages_pathcache:
raise FileNotFoundError
if compare_size and size != -1:
dest = Path(normpath(package_simple_path / href))
# So, do stat() for real only when we need to do so,
# have a size, and it really exists in pathcache.
dest_stat = dest.stat()
dest_size = dest_stat.st_size
if dest_size != size:
logger.info(
@@ -589,6 +593,10 @@
size,
)
return False
except FileNotFoundError:
logger.info("add %s as it's missing packages", package_name)
return False
return True
to_update = []
@@ -1229,18 +1237,56 @@ def verify(
# After some removal, local_names is changed.
local_names = set(local_db.keys())
logger.info("====== Step 3. Caching packages/ dirtree in memory for Step 4 & 5.")
packages_pathcache: set[str] = set()
with ThreadPoolExecutor(max_workers=IOWORKERS) as executor:
def packages_iterate(first_dirname: str, position: int) -> list[str]:
with tqdm(
desc=f"Iterating packages/{first_dirname}/*/*/*", position=position
) as pb:
res = []
for d1 in fast_iterdir(basedir / "packages" / first_dirname, "dir"):
for d2 in fast_iterdir(d1.path, "dir"):
for file in fast_iterdir(d2.path, "file"):
pb.update(1)
res.append(file.path)
return res
futures = {
executor.submit(packages_iterate, first_dir.name, idx % IOWORKERS): first_dir.name # type: ignore
for idx, first_dir in enumerate(fast_iterdir((basedir / "packages"), "dir"))
}
try:
for future in as_completed(futures):
sname = futures[future]
try:
for p in future.result():
packages_pathcache.add(p)
except Exception as e:
if isinstance(e, (KeyboardInterrupt)):
raise
logger.warning("%s generated an exception", sname, exc_info=True)
success = False
except (ExitProgramException, KeyboardInterrupt):
exit_with_futures(futures)
logger.info(
"====== Step 3. Make sure all local indexes are valid, and (if --sync-packages) have valid local package files ======"
"====== Step 4. Make sure all local indexes are valid, and (if --sync-packages) have valid local package files ======"
)
success = syncer.check_and_update(
list(local_names), prerelease_excludes, json_files, compare_size
list(local_names),
prerelease_excludes,
json_files,
packages_pathcache,
compare_size,
)
syncer.finalize()
logger.info(
"====== Step 4. Remove any unreferenced files in `packages` folder ======"
"====== Step 5. Remove any unreferenced files in `packages` folder ======"
)
ref_set: Set[str] = set()
ref_set: set[str] = set()
with ThreadPoolExecutor(max_workers=IOWORKERS) as executor:
# Part 1: iterate simple/
def iterate_simple(sname: str) -> list[str]:
@@ -1257,8 +1303,10 @@ def verify(
nps.append(np)
return nps
# MyPy does not enjoy same variable name with different types, even when --allow-redefinition
# Ignore here to make mypy happy
futures = {
executor.submit(iterate_simple, sname): sname for sname in simple_dirs
executor.submit(iterate_simple, sname): sname for sname in simple_dirs # type: ignore
}
try:
for future in tqdm(
@@ -1279,38 +1327,11 @@ def verify(
except (ExitProgramException, KeyboardInterrupt):
exit_with_futures(futures)
# Part 2: iterate packages
def unlink_not_in_set(first_dirname: str, position: int) -> None:
with tqdm(
desc=f"Iterating packages/{first_dirname}/*/*/*", position=position
) as pb:
for d1 in fast_iterdir(basedir / "packages" / first_dirname, "dir"):
for d2 in fast_iterdir(d1.path, "dir"):
for file in fast_iterdir(d2.path, "file"):
pb.update(1)
logger.debug("find file %s", file)
if file.path not in ref_set:
logger.info("removing unreferenced file %s", file.path)
Path(file.path).unlink()
# MyPy does not enjoy same variable name with different types, even when --allow-redefinition
# Ignore here to make mypy happy
futures = {
executor.submit(unlink_not_in_set, first_dir.name, idx % IOWORKERS): first_dir.name # type: ignore
for idx, first_dir in enumerate(fast_iterdir((basedir / "packages"), "dir"))
}
try:
for future in as_completed(futures):
sname = futures[future]
try:
future.result()
except Exception as e:
if isinstance(e, (KeyboardInterrupt)):
raise
logger.warning("%s generated an exception", sname, exc_info=True)
success = False
except (ExitProgramException, KeyboardInterrupt):
exit_with_futures(futures)
# Part 2: handling packages
for path in tqdm(packages_pathcache, desc="Iterating path cache"):
if path not in ref_set:
logger.info("removing unreferenced file %s", path)
Path(path).unlink()
logger.info("Verification finished. Success: %s", success)