mirror of
https://github.com/taoky/shadowmire.git
synced 2025-07-08 09:12:43 +00:00
Add generate pathcache as Step 3
This helps speed up check_and_update(), when --compare-size is NOT set.
This commit is contained in:
parent
7c963b00b1
commit
40c9edba50
@ -110,6 +110,8 @@ If you already have a PyPI repo, use `genlocal` first to generate a local db:
|
|||||||
> 1. `./shadowmire.py verify --sync-packages --remove-not-in-local --compare-size`: remove any local packages that were missing from upstream index (normally removed from PyPI), then download any missing metadata and packages. **This step is likely to take a very long time, depending on your network and disk speed.**
|
> 1. `./shadowmire.py verify --sync-packages --remove-not-in-local --compare-size`: remove any local packages that were missing from upstream index (normally removed from PyPI), then download any missing metadata and packages. **This step is likely to take a very long time, depending on your network and disk speed.**
|
||||||
> * Q: Why will there be packages that are in neither local db nor remote index?
|
> * Q: Why will there be packages that are in neither local db nor remote index?
|
||||||
> * A: They are packages without valid local metadata, and do not exist on PyPI anymore. These packages were typically downloaded a long time ago and later removed from upstream, but they may still share some blobs with currently available packages. E.g. after name normalization of `Foo` to `foo`, they share all existing blobs, but `Foo` does not change any more.
|
> * A: They are packages without valid local metadata, and do not exist on PyPI anymore. These packages were typically downloaded a long time ago and later removed from upstream, but they may still share some blobs with currently available packages. E.g. after name normalization of `Foo` to `foo`, they share all existing blobs, but `Foo` does not change any more.
|
||||||
|
> * Q: My HDD (array) is too, too, too slooooow. Is there any way to speed this up?
|
||||||
|
> * A: You could try removing the `--compare-size` argument, at the cost of a very small chance that some local package files are inconsistent with upstream (file sizes are then not checked).
|
||||||
> 1. `./shadowmire.py genlocal`: generate local database again.
|
> 1. `./shadowmire.py genlocal`: generate local database again.
|
||||||
> 1. `./shadowmire.py sync --sync-packages`: synchronize new changes after verification.
|
> 1. `./shadowmire.py sync --sync-packages`: synchronize new changes after verification.
|
||||||
|
|
||||||
|
107
shadowmire.py
107
shadowmire.py
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
import sys
|
import sys
|
||||||
from types import FrameType
|
from types import FrameType
|
||||||
from typing import IO, Any, Callable, Generator, Literal, NoReturn, Optional, Set
|
from typing import IO, Any, Callable, Generator, Literal, NoReturn, Optional
|
||||||
import xmlrpc.client
|
import xmlrpc.client
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import re
|
import re
|
||||||
@ -537,6 +537,7 @@ class SyncBase:
|
|||||||
package_names: list[str],
|
package_names: list[str],
|
||||||
prerelease_excludes: list[re.Pattern[str]],
|
prerelease_excludes: list[re.Pattern[str]],
|
||||||
json_files: set[str],
|
json_files: set[str],
|
||||||
|
packages_pathcache: set[str],
|
||||||
compare_size: bool,
|
compare_size: bool,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
def is_consistent(package_name: str) -> bool:
|
def is_consistent(package_name: str) -> bool:
|
||||||
@ -573,13 +574,16 @@ class SyncBase:
|
|||||||
# OK, check if all hrefs have corresponding files
|
# OK, check if all hrefs have corresponding files
|
||||||
if self.sync_packages:
|
if self.sync_packages:
|
||||||
for href, size in hrefsize_json:
|
for href, size in hrefsize_json:
|
||||||
dest = Path(normpath(package_simple_path / href))
|
dest_pathstr = normpath(package_simple_path / href)
|
||||||
try:
|
try:
|
||||||
dest_stat = dest.stat()
|
# Fast shortcut: consult the in-memory path cache instead of calling stat() on the file
|
||||||
except FileNotFoundError:
|
if dest_pathstr not in packages_pathcache:
|
||||||
logger.info("add %s as it's missing packages", package_name)
|
raise FileNotFoundError
|
||||||
return False
|
|
||||||
if compare_size and size != -1:
|
if compare_size and size != -1:
|
||||||
|
dest = Path(normpath(package_simple_path / href))
|
||||||
|
# Only call stat() for real when it is actually needed: size comparison is enabled,
|
||||||
|
# the index provides a size, and the file is known to exist in the path cache.
|
||||||
|
dest_stat = dest.stat()
|
||||||
dest_size = dest_stat.st_size
|
dest_size = dest_stat.st_size
|
||||||
if dest_size != size:
|
if dest_size != size:
|
||||||
logger.info(
|
logger.info(
|
||||||
@ -589,6 +593,10 @@ class SyncBase:
|
|||||||
size,
|
size,
|
||||||
)
|
)
|
||||||
return False
|
return False
|
||||||
|
except FileNotFoundError:
|
||||||
|
logger.info("add %s as it's missing packages", package_name)
|
||||||
|
return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
to_update = []
|
to_update = []
|
||||||
@ -1229,18 +1237,56 @@ def verify(
|
|||||||
# After some removal, local_names is changed.
|
# After some removal, local_names is changed.
|
||||||
local_names = set(local_db.keys())
|
local_names = set(local_db.keys())
|
||||||
|
|
||||||
|
logger.info("====== Step 3. Caching packages/ dirtree in memory for Step 4 & 5.")
|
||||||
|
packages_pathcache: set[str] = set()
|
||||||
|
with ThreadPoolExecutor(max_workers=IOWORKERS) as executor:
|
||||||
|
|
||||||
|
def packages_iterate(first_dirname: str, position: int) -> list[str]:
|
||||||
|
with tqdm(
|
||||||
|
desc=f"Iterating packages/{first_dirname}/*/*/*", position=position
|
||||||
|
) as pb:
|
||||||
|
res = []
|
||||||
|
for d1 in fast_iterdir(basedir / "packages" / first_dirname, "dir"):
|
||||||
|
for d2 in fast_iterdir(d1.path, "dir"):
|
||||||
|
for file in fast_iterdir(d2.path, "file"):
|
||||||
|
pb.update(1)
|
||||||
|
res.append(file.path)
|
||||||
|
return res
|
||||||
|
|
||||||
|
futures = {
|
||||||
|
executor.submit(packages_iterate, first_dir.name, idx % IOWORKERS): first_dir.name # type: ignore
|
||||||
|
for idx, first_dir in enumerate(fast_iterdir((basedir / "packages"), "dir"))
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
for future in as_completed(futures):
|
||||||
|
sname = futures[future]
|
||||||
|
try:
|
||||||
|
for p in future.result():
|
||||||
|
packages_pathcache.add(p)
|
||||||
|
except Exception as e:
|
||||||
|
if isinstance(e, (KeyboardInterrupt)):
|
||||||
|
raise
|
||||||
|
logger.warning("%s generated an exception", sname, exc_info=True)
|
||||||
|
success = False
|
||||||
|
except (ExitProgramException, KeyboardInterrupt):
|
||||||
|
exit_with_futures(futures)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"====== Step 3. Make sure all local indexes are valid, and (if --sync-packages) have valid local package files ======"
|
"====== Step 4. Make sure all local indexes are valid, and (if --sync-packages) have valid local package files ======"
|
||||||
)
|
)
|
||||||
success = syncer.check_and_update(
|
success = syncer.check_and_update(
|
||||||
list(local_names), prerelease_excludes, json_files, compare_size
|
list(local_names),
|
||||||
|
prerelease_excludes,
|
||||||
|
json_files,
|
||||||
|
packages_pathcache,
|
||||||
|
compare_size,
|
||||||
)
|
)
|
||||||
syncer.finalize()
|
syncer.finalize()
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"====== Step 4. Remove any unreferenced files in `packages` folder ======"
|
"====== Step 5. Remove any unreferenced files in `packages` folder ======"
|
||||||
)
|
)
|
||||||
ref_set: Set[str] = set()
|
ref_set: set[str] = set()
|
||||||
with ThreadPoolExecutor(max_workers=IOWORKERS) as executor:
|
with ThreadPoolExecutor(max_workers=IOWORKERS) as executor:
|
||||||
# Part 1: iterate simple/
|
# Part 1: iterate simple/
|
||||||
def iterate_simple(sname: str) -> list[str]:
|
def iterate_simple(sname: str) -> list[str]:
|
||||||
@ -1257,8 +1303,10 @@ def verify(
|
|||||||
nps.append(np)
|
nps.append(np)
|
||||||
return nps
|
return nps
|
||||||
|
|
||||||
|
# mypy does not allow reusing the same variable name with a different type, even with --allow-redefinition
|
||||||
|
# Ignore here to make mypy happy
|
||||||
futures = {
|
futures = {
|
||||||
executor.submit(iterate_simple, sname): sname for sname in simple_dirs
|
executor.submit(iterate_simple, sname): sname for sname in simple_dirs # type: ignore
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
for future in tqdm(
|
for future in tqdm(
|
||||||
@ -1279,38 +1327,11 @@ def verify(
|
|||||||
except (ExitProgramException, KeyboardInterrupt):
|
except (ExitProgramException, KeyboardInterrupt):
|
||||||
exit_with_futures(futures)
|
exit_with_futures(futures)
|
||||||
|
|
||||||
# Part 2: iterate packages
|
# Part 2: handling packages
|
||||||
def unlink_not_in_set(first_dirname: str, position: int) -> None:
|
for path in tqdm(packages_pathcache, desc="Iterating path cache"):
|
||||||
with tqdm(
|
if path not in ref_set:
|
||||||
desc=f"Iterating packages/{first_dirname}/*/*/*", position=position
|
logger.info("removing unreferenced file %s", path)
|
||||||
) as pb:
|
Path(path).unlink()
|
||||||
for d1 in fast_iterdir(basedir / "packages" / first_dirname, "dir"):
|
|
||||||
for d2 in fast_iterdir(d1.path, "dir"):
|
|
||||||
for file in fast_iterdir(d2.path, "file"):
|
|
||||||
pb.update(1)
|
|
||||||
logger.debug("find file %s", file)
|
|
||||||
if file.path not in ref_set:
|
|
||||||
logger.info("removing unreferenced file %s", file.path)
|
|
||||||
Path(file.path).unlink()
|
|
||||||
|
|
||||||
# MyPy does not enjoy same variable name with different types, even when --allow-redefinition
|
|
||||||
# Ignore here to make mypy happy
|
|
||||||
futures = {
|
|
||||||
executor.submit(unlink_not_in_set, first_dir.name, idx % IOWORKERS): first_dir.name # type: ignore
|
|
||||||
for idx, first_dir in enumerate(fast_iterdir((basedir / "packages"), "dir"))
|
|
||||||
}
|
|
||||||
try:
|
|
||||||
for future in as_completed(futures):
|
|
||||||
sname = futures[future]
|
|
||||||
try:
|
|
||||||
future.result()
|
|
||||||
except Exception as e:
|
|
||||||
if isinstance(e, (KeyboardInterrupt)):
|
|
||||||
raise
|
|
||||||
logger.warning("%s generated an exception", sname, exc_info=True)
|
|
||||||
success = False
|
|
||||||
except (ExitProgramException, KeyboardInterrupt):
|
|
||||||
exit_with_futures(futures)
|
|
||||||
|
|
||||||
logger.info("Verification finished. Success: %s", success)
|
logger.info("Verification finished. Success: %s", success)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user