#!/usr/bin/env python

import sys
from types import FrameType
from typing import IO, Any, Callable, Generator, Literal, NoReturn, Optional
import xmlrpc.client
from dataclasses import dataclass
import re
import json
from urllib.parse import urljoin, urlparse, urlunparse, unquote
from pathlib import Path
from html.parser import HTMLParser
import logging
import html
import os
from os.path import (
    normpath,
)  # fast path computation, instead of accessing real files like pathlib
from contextlib import contextmanager
import sqlite3
from concurrent.futures import Future, ThreadPoolExecutor, as_completed
import signal
import tomllib
from copy import deepcopy
import functools
from http.client import HTTPConnection

import requests
import click
from tqdm import tqdm
from requests.adapters import HTTPAdapter, Retry

LOG_FORMAT = "%(asctime)s %(levelname)s: %(message)s (%(filename)s:%(lineno)d)"
logger = logging.getLogger("shadowmire")


USER_AGENT = "Shadowmire (https://github.com/taoky/shadowmire)"

# Note that it's suggested to use only 3 workers for PyPI.
WORKERS = int(os.environ.get("SHADOWMIRE_WORKERS", "3"))
# Use threads to parallelize local I/O during verification
IOWORKERS = int(os.environ.get("SHADOWMIRE_IOWORKERS", "2"))
# A safety net -- to avoid upstream issues causing too many packages to be removed when determining the sync plan.
MAX_DELETION = int(os.environ.get("SHADOWMIRE_MAX_DELETION", "50000"))
# Sometimes PyPI is not consistent -- new packages could not be fetched. This option tries to avoid permanently marking that kind of package as nonexistent.
IGNORE_THRESHOLD = int(os.environ.get("SHADOWMIRE_IGNORE_THRESHOLD", "1024"))
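# All four knobs above are read from the environment at import time. An illustrative
# (not prescriptive) invocation raising both download and local-I/O parallelism:
#   SHADOWMIRE_WORKERS=5 SHADOWMIRE_IOWORKERS=8 ./shadowmire.py verify --sync-packages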

# https://github.com/pypa/bandersnatch/blob/a05af547f8d1958217ef0dc0028890b1839e6116/src/bandersnatch_filter_plugins/prerelease_name.py#L18C1-L23C6
PRERELEASE_PATTERNS = (
    re.compile(r".+rc\d+$"),
    re.compile(r".+a(lpha)?\d+$"),
    re.compile(r".+b(eta)?\d+$"),
    re.compile(r".+dev\d+$"),
)
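# These patterns are matched against version strings: for example "1.2.0rc1",
# "1.2a3", "1.2beta4" and "1.2.dev5" are all treated as prereleases, while
# "1.2.0" is not.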


class PackageNotFoundError(Exception):
    pass


class ExitProgramException(Exception):
    pass


def exit_handler(signum: int, frame: Optional[FrameType]) -> None:
    raise ExitProgramException


signal.signal(signal.SIGTERM, exit_handler)


def exit_with_futures(futures: dict[Future[Any], Any]) -> NoReturn:
    logger.info("Exiting...")
    for future in futures:
        future.cancel()
    sys.exit(1)


class LocalVersionKV:
    """
    A key-value database wrapper over sqlite3.

    Because the sqlite database could be inconsistent if it is written to while a
    downstream mirror is downloading it, an extra "jsonpath" is used to store the
    key-value results when necessary.
    """

    def __init__(self, dbpath: Path, jsonpath: Path) -> None:
        self.conn = sqlite3.connect(dbpath)
        self.jsonpath = jsonpath
        cur = self.conn.cursor()
        cur.execute(
            "CREATE TABLE IF NOT EXISTS local(key TEXT PRIMARY KEY, value INT NOT NULL)"
        )
        self.conn.commit()

    def get(self, key: str) -> Optional[int]:
        cur = self.conn.cursor()
        res = cur.execute("SELECT value FROM local WHERE key = ?", (key,))
        row = res.fetchone()
        return row[0] if row else None

    INSERT_SQL = "INSERT INTO local (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value=excluded.value"

    def set(self, key: str, value: int) -> None:
        cur = self.conn.cursor()
        cur.execute(self.INSERT_SQL, (key, value))
        self.conn.commit()

    def batch_set(self, d: dict[str, int]) -> None:
        cur = self.conn.cursor()
        kvs = list(d.items())
        cur.executemany(self.INSERT_SQL, kvs)
        self.conn.commit()

    def remove(self, key: str) -> None:
        cur = self.conn.cursor()
        cur.execute("DELETE FROM local WHERE key = ?", (key,))
        self.conn.commit()

    def remove_invalid(self) -> int:
        cur = self.conn.cursor()
        cur.execute("DELETE FROM local WHERE value = -1")
        rowcnt = cur.rowcount
        self.conn.commit()
        return rowcnt

    def nuke(self, commit: bool = True) -> None:
        cur = self.conn.cursor()
        cur.execute("DELETE FROM local")
        if commit:
            self.conn.commit()

    def keys(self, skip_invalid: bool = True) -> list[str]:
        cur = self.conn.cursor()
        if skip_invalid:
            res = cur.execute("SELECT key FROM local WHERE value != -1")
        else:
            res = cur.execute("SELECT key FROM local")
        rows = res.fetchall()
        return [row[0] for row in rows]

    def dump(self, skip_invalid: bool = True) -> dict[str, int]:
        cur = self.conn.cursor()
        if skip_invalid:
            res = cur.execute("SELECT key, value FROM local WHERE value != -1")
        else:
            res = cur.execute("SELECT key, value FROM local")
        rows = res.fetchall()
        return {row[0]: row[1] for row in rows}

    def dump_json(self, skip_invalid: bool = True) -> None:
        res = self.dump(skip_invalid)
        with overwrite(self.jsonpath) as f:
            json.dump(res, f, indent=2)
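    # Typical usage (illustrative):
    #   kv = LocalVersionKV(Path("local.db"), Path("local.json"))
    #   kv.set("requests", 12345); kv.get("requests")  # -> 12345
    # A value of -1 marks a package that upstream lists but that could not
    # actually be fetched.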


@contextmanager
def overwrite(
    file_path: Path, mode: str = "w", tmp_suffix: str = ".tmp"
) -> Generator[IO[Any], None, None]:
    tmp_path = file_path.parent / (file_path.name + tmp_suffix)
    try:
        with open(tmp_path, mode) as tmp_file:
            yield tmp_file
        tmp_path.rename(file_path)
    except Exception:
        # well, just keep the tmp_path in error case.
        raise
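# Usage sketch: "with overwrite(path) as f: f.write(data)" writes to "<path>.tmp"
# first and renames it over <path> only if the block exits cleanly.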


def fast_readall(file_path: Path) -> bytes:
    """
    Save some extra read(), lseek() and ioctl().
    """
    fd = os.open(file_path, os.O_RDONLY)
    if fd < 0:
        raise FileNotFoundError(file_path)
    try:
        contents = os.read(fd, file_path.stat().st_size)
        return contents
    finally:
        os.close(fd)


def normalize(name: str) -> str:
    """
    See https://peps.python.org/pep-0503/#normalized-names
    """
    return re.sub(r"[-_.]+", "-", name).lower()
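    # e.g. normalize("Foo._-Bar") == "foo-bar" (PEP 503 normalization)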


def remove_dir_with_files(directory: Path) -> None:
    """
    Remove dir in a safer (non-recursive) way, which means that the directory should have no child directories.
    """
    if not directory.exists():
        return
    assert directory.is_dir()
    for item in directory.iterdir():
        item.unlink()
    directory.rmdir()
    logger.info("Removed dir %s", directory)


def fast_iterdir(
    directory: Path | str, filter_type: Literal["dir", "file"]
) -> Generator[os.DirEntry[str], Any, None]:
    """
    iterdir() in pathlib would ignore file type information from getdents64(),
    which is not acceptable when you have millions of files in one directory,
    and you need to filter out all files/directories.
    """
    assert filter_type in ["dir", "file"]
    for item in os.scandir(directory):
        if filter_type == "dir" and item.is_dir():
            yield item
        elif filter_type == "file" and item.is_file():
            yield item
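# Example (illustrative): count first-level package directories without an extra
# stat() per entry:
#   num_pkgs = sum(1 for _ in fast_iterdir(basedir / "simple", "dir"))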


def get_package_urls_from_index_html(html_path: Path) -> list[str]:
    """
    Get all <a> href (fragments removed) from given simple/<package>/index.html contents
    """

    class ATagHTMLParser(HTMLParser):
        def __init__(self) -> None:
            super().__init__()
            self.hrefs: list[Optional[str]] = []

        def handle_starttag(
            self, tag: str, attrs: list[tuple[str, str | None]]
        ) -> None:
            if tag == "a":
                for attr in attrs:
                    if attr[0] == "href":
                        self.hrefs.append(attr[1])

    p = ATagHTMLParser()
    contents = fast_readall(html_path).decode()
    p.feed(contents)

    ret = []
    for href in p.hrefs:
        if href:
            parsed_url = urlparse(href)
            clean_url = urlunparse(parsed_url._replace(fragment=""))
            ret.append(clean_url)
    return ret


def get_package_urls_from_index_json(json_path: Path) -> list[str]:
    """
    Get all urls from given simple/<package>/index.v1_json contents
    """
    contents = fast_readall(json_path)
    contents_dict = json.loads(contents)
    urls = [i["url"] for i in contents_dict["files"]]
    return urls


def get_package_urls_size_from_index_json(json_path: Path) -> list[tuple[str, int]]:
    """
    Get all urls and size from given simple/<package>/index.v1_json contents

    If size is not available, returns size as -1
    """
    contents = fast_readall(json_path)
    contents_dict = json.loads(contents)
    ret = [(i["url"], i.get("size", -1)) for i in contents_dict["files"]]
    return ret


def get_existing_hrefs(package_simple_path: Path) -> Optional[list[str]]:
    """
    Some packages have no release files (or no local index at all), so this returns None
    when reading the index fails, otherwise an empty list or a list of hrefs.

    Priority: index.v1_json -> index.html
    """
    json_file = package_simple_path / "index.v1_json"
    html_file = package_simple_path / "index.html"
    try:
        return get_package_urls_from_index_json(json_file)
    except FileNotFoundError:
        try:
            return get_package_urls_from_index_html(html_file)
        except FileNotFoundError:
            return None


class CustomXMLRPCTransport(xmlrpc.client.Transport):
    """
    Set user-agent for xmlrpc.client
    """

    user_agent = USER_AGENT

    def make_connection(self, host: tuple[str, dict[str, str]] | str) -> HTTPConnection:
        conn = super().make_connection(host)
        if conn.timeout is None:
            # 2 min timeout
            conn.timeout = 120
        return conn


def create_requests_session() -> requests.Session:
    s = requests.Session()
    # hardcode 1min timeout for connect & read for now
    # https://requests.readthedocs.io/en/latest/user/advanced/#timeouts
    # A hack to overwrite get() method
    s.get_orig, s.get = s.get, functools.partial(s.get, timeout=(60, 60))  # type: ignore
    retries = Retry(total=3, backoff_factor=0.1)
    s.mount("http://", HTTPAdapter(max_retries=retries))
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.headers.update({"User-Agent": USER_AGENT})
    return s
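# Note: because s.get is wrapped with functools.partial above, every session.get(url)
# call carries timeout=(60, 60) unless the caller passes its own timeout keyword;
# the original bound method remains reachable as s.get_orig.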


class PyPI:
    """
    Upstream which implements full PyPI APIs
    """

    host = "https://pypi.org"
    # Let's assume that only sha256 exists...
    digest_name = "sha256"

    def __init__(self) -> None:
        self.xmlrpc_client = xmlrpc.client.ServerProxy(
            urljoin(self.host, "pypi"), transport=CustomXMLRPCTransport()
        )
        self.session = create_requests_session()

    def list_packages_with_serial(self, do_normalize: bool = True) -> dict[str, int]:
        logger.info(
            "Calling list_packages_with_serial() RPC, this requires some time..."
        )
        ret: dict[str, int] = self.xmlrpc_client.list_packages_with_serial()  # type: ignore
        if do_normalize:
            for key in list(ret.keys()):
                normalized_key = normalize(key)
                if normalized_key == key:
                    continue
                ret[normalized_key] = ret[key]
                del ret[key]
        return ret

    def changelog_last_serial(self) -> int:
        return self.xmlrpc_client.changelog_last_serial()  # type: ignore

    def get_package_metadata(self, package_name: str) -> dict:
        req = self.session.get(urljoin(self.host, f"pypi/{package_name}/json"))
        if req.status_code == 404:
            raise PackageNotFoundError
        return req.json()  # type: ignore

    @staticmethod
    def get_release_files_from_meta(package_meta: dict) -> list[dict]:
        release_files = []
        for release in package_meta["releases"].values():
            release_files.extend(release)
        release_files.sort(key=lambda x: x["filename"])
        return release_files

    @staticmethod
    def file_url_to_local_url(url: str) -> str:
        """
        This function should NOT be used to construct a local Path!
        """
        parsed = urlparse(url)
        assert parsed.path.startswith("/packages")
        prefix = "../.."
        return prefix + parsed.path

    @staticmethod
    def file_url_to_local_path(url: str) -> Path:
        """
        Unquote() and returns a Path
        """
        path = urlparse(url).path
        path = unquote(path)
        assert path.startswith("/packages")
        path = path[1:]
        return Path("../..") / path
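    # Example (illustrative): a release URL such as
    # "https://files.pythonhosted.org/packages/ab/cd/foo-1.0.tar.gz" maps to the
    # relative href "../../packages/ab/cd/foo-1.0.tar.gz", which resolves from
    # simple/<package>/ into the shared packages/ tree.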

    # Func modified from bandersnatch
    @classmethod
    def generate_html_simple_page(cls, package_meta: dict) -> str:
        package_rawname = package_meta["info"]["name"]
        simple_page_content = (
            "<!DOCTYPE html>\n"
            "<html>\n"
            "  <head>\n"
            '    <meta name="pypi:repository-version" content="{0}">\n'
            "    <title>Links for {1}</title>\n"
            "  </head>\n"
            "  <body>\n"
            "    <h1>Links for {1}</h1>\n"
        ).format("1.0", package_rawname)

        release_files = cls.get_release_files_from_meta(package_meta)

        def gen_html_file_tags(release: dict) -> str:
            file_tags = ""

            # data-requires-python: requires_python
            if "requires_python" in release and release["requires_python"] is not None:
                file_tags += (
                    f' data-requires-python="{html.escape(release["requires_python"])}"'
                )

            # data-yanked: yanked_reason
            if "yanked" in release and release["yanked"]:
                if "yanked_reason" in release and release["yanked_reason"]:
                    file_tags += (
                        f' data-yanked="{html.escape(release["yanked_reason"])}"'
                    )
                else:
                    file_tags += ' data-yanked=""'

            return file_tags

        simple_page_content += "\n".join(
            [
                '    <a href="{}#{}={}"{}>{}</a><br/>'.format(
                    cls.file_url_to_local_url(r["url"]),
                    cls.digest_name,
                    r["digests"][cls.digest_name],
                    gen_html_file_tags(r),
                    r["filename"],
                )
                for r in release_files
            ]
        )

        simple_page_content += (
            f"\n  </body>\n</html>\n<!--SERIAL {package_meta['last_serial']}-->"
        )

        return simple_page_content

    # Func modified from bandersnatch
    @classmethod
    def generate_json_simple_page(cls, package_meta: dict) -> str:
        package_json: dict[str, Any] = {
            "files": [],
            "meta": {
                "api-version": "1.1",
                "_last-serial": str(package_meta["last_serial"]),
            },
            "name": package_meta["info"]["name"],
            # (bandersnatch) TODO: Just sorting by default sort - Maybe specify order in future PEP
            "versions": sorted(package_meta["releases"].keys()),
        }
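        # The layout above follows the JSON Simple API (PEP 691, with the PEP 700
        # additions "versions", "size" and "upload-time" under api-version 1.1),
        # so the generated index.v1_json can be served to installers as-is.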

        release_files = cls.get_release_files_from_meta(package_meta)

        # Add release files into the JSON dict
        for r in release_files:
            package_json["files"].append(
                {
                    "filename": r["filename"],
                    "hashes": {
                        cls.digest_name: r["digests"][cls.digest_name],
                    },
                    "requires-python": r.get("requires_python", ""),
                    "size": r["size"],
                    "upload-time": r.get("upload_time_iso_8601", ""),
                    "url": cls.file_url_to_local_url(r["url"]),
                    "yanked": r.get("yanked", False),
                }
            )

        return json.dumps(package_json)


# (normalized_name as key, value)
ShadowmirePackageItem = tuple[str, int]


@dataclass
class Plan:
    remove: list[str]
    update: list[str]


def match_patterns(
    s: str, ps: list[re.Pattern[str]] | tuple[re.Pattern[str], ...]
) -> bool:
    for p in ps:
        if p.match(s):
            return True
    return False


class SyncBase:
    def __init__(
        self, basedir: Path, local_db: LocalVersionKV, sync_packages: bool = False
    ) -> None:
        self.basedir = basedir
        self.local_db = local_db
        self.simple_dir = basedir / "simple"
        self.packages_dir = basedir / "packages"
        self.jsonmeta_dir = basedir / "json"
        # create the dirs, if not exist
        self.simple_dir.mkdir(parents=True, exist_ok=True)
        self.packages_dir.mkdir(parents=True, exist_ok=True)
        self.jsonmeta_dir.mkdir(parents=True, exist_ok=True)
        self.sync_packages = sync_packages

    def filter_remote_with_excludes(
        self, remote: dict[str, int], excludes: list[re.Pattern[str]]
    ) -> dict[str, int]:
        if not excludes:
            return remote
        res = {}
        for k, v in remote.items():
            matched = match_patterns(k, excludes)
            if not matched:
                res[k] = v
        return res

    def determine_sync_plan(
        self, local: dict[str, int], excludes: list[re.Pattern[str]]
    ) -> Plan:
        """
        local should NOT skip invalid (-1) serials
        """
        remote = self.fetch_remote_versions()
        remote = self.filter_remote_with_excludes(remote, excludes)
        with open(self.basedir / "remote_excluded.json", "w") as f:
            json.dump(remote, f)

        to_remove = []
        to_update = []
        local_keys = set(local.keys())
        remote_keys = set(remote.keys())
        for i in local_keys - remote_keys:
            to_remove.append(i)
            local_keys.remove(i)
        # There are always some packages in PyPI's list_packages_with_serial() that actually don't exist
        # Don't count them when comparing len(to_remove) with MAX_DELETION
        if len(to_remove) > MAX_DELETION:
            logger.error(
                "Too many packages to remove (%d > %d)", len(to_remove), MAX_DELETION
            )
            logger.info("Some packages that would be removed:")
            for p in to_remove[:100]:
                logger.info("- %s", p)
            for p in to_remove[100:]:
                logger.debug("- %s", p)
            logger.error(
                "Use SHADOWMIRE_MAX_DELETION env to adjust the threshold if you really want to proceed"
            )
            sys.exit(2)
        for i in remote_keys - local_keys:
            to_update.append(i)
        for i in local_keys:
            local_serial = local[i]
            remote_serial = remote[i]
            if local_serial != remote_serial:
                if local_serial == -1:
                    logger.info("skip %s, as it's marked as not exist at upstream", i)
                    to_remove.append(i)
                else:
                    to_update.append(i)
        output = Plan(remove=to_remove, update=to_update)
        return output
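    # Example (illustrative): with local = {"a": 1, "b": 2, "c": -1, "e": 9} and
    # remote = {"a": 1, "b": 3, "c": 5, "d": 4}, the plan updates "d" (new) and "b"
    # (serial changed), and removes "e" (gone from remote) and "c" (locally marked
    # nonexistent with serial -1); "a" is untouched since its serial is unchanged.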

    def fetch_remote_versions(self) -> dict[str, int]:
        raise NotImplementedError

    def check_and_update(
        self,
        package_names: list[str],
        prerelease_excludes: list[re.Pattern[str]],
        json_files: set[str],
        packages_pathcache: set[str],
        compare_size: bool,
    ) -> bool:
        def is_consistent(package_name: str) -> bool:
            if package_name not in json_files:
                # save a newfstatat() when name already in json_files
                logger.info("add %s as it does not have json API file", package_name)
                return False
            package_simple_path = self.simple_dir / package_name
            html_simple = package_simple_path / "index.html"
            htmlv1_simple = package_simple_path / "index.v1_html"
            json_simple = package_simple_path / "index.v1_json"
            try:
                # always create index.html symlink, if not exists or not a symlink
                if not html_simple.is_symlink():
                    html_simple.unlink(missing_ok=True)
                    html_simple.symlink_to("index.v1_html")
                hrefs_html = get_package_urls_from_index_html(htmlv1_simple)
                hrefsize_json = get_package_urls_size_from_index_json(json_simple)
            except FileNotFoundError:
                logger.info(
                    "add %s as it does not have index.v1_html or index.v1_json",
                    package_name,
                )
                return False
            if (
                hrefs_html is None
                or hrefsize_json is None
                or hrefs_html != [i[0] for i in hrefsize_json]
            ):
                # something unexpected happens...
                logger.info("add %s as its indexes are not consistent", package_name)
                return False

            # OK, check if all hrefs have corresponding files
            if self.sync_packages:
                for href, size in hrefsize_json:
                    relative_path = unquote(href)
                    dest_pathstr = normpath(package_simple_path / relative_path)
                    try:
                        # Fast shortcut to avoid stat() it
                        if dest_pathstr not in packages_pathcache:
                            raise FileNotFoundError
                        if compare_size and size != -1:
                            dest = Path(dest_pathstr)
                            # So, do stat() for real only when we need to do so,
                            # have a size, and it really exists in pathcache.
                            dest_stat = dest.stat()
                            dest_size = dest_stat.st_size
                            if dest_size != size:
                                logger.info(
                                    "add %s as its local size %s != %s",
                                    package_name,
                                    dest_size,
                                    size,
                                )
                                return False
                    except FileNotFoundError:
                        logger.info("add %s as it's missing packages", package_name)
                        return False

            return True

        to_update = []
        with ThreadPoolExecutor(max_workers=IOWORKERS) as executor:
            futures = {
                executor.submit(is_consistent, package_name): package_name
                for package_name in package_names
            }
            try:
                for future in tqdm(
                    as_completed(futures),
                    total=len(package_names),
                    desc="Checking consistency",
                ):
                    package_name = futures[future]
                    try:
                        consistent = future.result()
                        if not consistent:
                            to_update.append(package_name)
                    except Exception:
                        logger.warning(
                            "%s generated an exception", package_name, exc_info=True
                        )
                        raise
            except:
                exit_with_futures(futures)

        logger.info("%s packages to update in check_and_update()", len(to_update))
        return self.parallel_update(to_update, prerelease_excludes)

    def parallel_update(
        self, package_names: list[str], prerelease_excludes: list[re.Pattern[str]]
    ) -> bool:
        success = True
        with ThreadPoolExecutor(max_workers=WORKERS) as executor:
            futures = {
                executor.submit(
                    self.do_update, package_name, prerelease_excludes, False
                ): (
                    idx,
                    package_name,
                )
                for idx, package_name in enumerate(package_names)
            }
            try:
                for future in tqdm(
                    as_completed(futures), total=len(package_names), desc="Updating"
                ):
                    idx, package_name = futures[future]
                    try:
                        serial = future.result()
                        if serial:
                            self.local_db.set(package_name, serial)
                    except Exception as e:
                        if isinstance(e, (KeyboardInterrupt)):
                            raise
                        logger.warning(
                            "%s generated an exception", package_name, exc_info=True
                        )
                        success = False
                    if idx % 100 == 0:
                        logger.info("dumping local db...")
                        self.local_db.dump_json()
            except (ExitProgramException, KeyboardInterrupt):
                exit_with_futures(futures)
        return success

    def do_sync_plan(
        self, plan: Plan, prerelease_excludes: list[re.Pattern[str]]
    ) -> bool:
        to_remove = plan.remove
        to_update = plan.update

        for package_name in to_remove:
            self.do_remove(package_name)

        return self.parallel_update(to_update, prerelease_excludes)

    def do_remove(
        self, package_name: str, use_db: bool = True, remove_packages: bool = True
    ) -> None:
        metajson_path = self.jsonmeta_dir / package_name
        package_simple_dir = self.simple_dir / package_name
        if metajson_path.exists() or package_simple_dir.exists():
            # To make this less noisy...
            logger.info("Removing package %s", package_name)
        packages_to_remove = get_existing_hrefs(package_simple_dir)
        if remove_packages and packages_to_remove:
            paths_to_remove = [package_simple_dir / p for p in packages_to_remove]
            for p in paths_to_remove:
                if p.exists():
                    p.unlink()
                    logger.info("Removed file %s", p)
        remove_dir_with_files(package_simple_dir)
        metajson_path = self.jsonmeta_dir / package_name
        metajson_path.unlink(missing_ok=True)
        if use_db:
            old_serial = self.local_db.get(package_name)
            if old_serial != -1:
                self.local_db.remove(package_name)

    def do_update(
        self,
        package_name: str,
        prerelease_excludes: list[re.Pattern[str]],
        use_db: bool = True,
    ) -> Optional[int]:
        raise NotImplementedError

    def write_meta_to_simple(self, package_simple_path: Path, meta: dict) -> None:
        simple_html_contents = PyPI.generate_html_simple_page(meta)
        simple_json_contents = PyPI.generate_json_simple_page(meta)
        for html_filename in ("index.v1_html",):
            html_path = package_simple_path / html_filename
            with overwrite(html_path) as f:
                f.write(simple_html_contents)
        for json_filename in ("index.v1_json",):
            json_path = package_simple_path / json_filename
            with overwrite(json_path) as f:
                f.write(simple_json_contents)
        index_html_path = package_simple_path / "index.html"
        if not index_html_path.is_symlink():
            if index_html_path.exists():
                index_html_path.unlink()
            index_html_path.symlink_to("index.v1_html")

    def finalize(self) -> None:
        local_names = self.local_db.keys()
        # generate index.html at basedir
        index_path = self.basedir / "simple" / "index.html"
        # modified from bandersnatch
        with overwrite(index_path) as f:
            f.write("<!DOCTYPE html>\n")
            f.write("<html>\n")
            f.write("  <head>\n")
            f.write('    <meta name="pypi:repository-version" content="1.0">\n')
            f.write("    <title>Simple Index</title>\n")
            f.write("  </head>\n")
            f.write("  <body>\n")
            # This will either be the simple dir, or if we are using index
            # directory hashing, a list of subdirs to process.
            for pkg in local_names:
                # We're really trusting that this is all encoded in UTF-8. :/
                f.write(f'    <a href="{pkg}/">{pkg}</a><br/>\n')
            f.write("  </body>\n</html>")
        self.local_db.dump_json()

    def skip_this_package(self, i: dict, dest: Path) -> bool:
        """
        A helper function for subclasses implementing do_update().
        As the existence check is also done with stat(), this does not add extra I/O overhead.
        Returns whether this package file should be skipped.
        """
        try:
            dest_size = dest.stat().st_size
            i_size = i.get("size", -1)
            if i_size == -1:
                return True
            if dest_size == i_size:
                return True
            logger.warning(
                "file %s exists locally, but size does not match with upstream, so it would still be downloaded.",
                dest,
            )
            return False
        except FileNotFoundError:
            return False


def download(
    session: requests.Session, url: str, dest: Path
) -> tuple[bool, Optional[requests.Response]]:
    try:
        resp = session.get(url, allow_redirects=True)
    except requests.RequestException:
        logger.warning("download %s failed with exception", url, exc_info=True)
        return False, None
    if resp.status_code >= 400:
        logger.warning(
            "download %s failed with status %s, skipping this package",
            url,
            resp.status_code,
        )
        return False, resp
    with overwrite(dest, "wb") as f:
        f.write(resp.content)
    return True, resp
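# Because the body is written via overwrite(), a partially written file only ever
# exists as "<dest>.tmp"; dest itself is replaced by a rename only after the full
# response body has been written.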


def filter_release_from_meta(
    meta: dict, patterns: list[re.Pattern[str]] | tuple[re.Pattern[str], ...]
) -> None:
    for release in list(meta["releases"].keys()):
        if match_patterns(release, patterns):
            del meta["releases"][release]


class SyncPyPI(SyncBase):
    def __init__(
        self, basedir: Path, local_db: LocalVersionKV, sync_packages: bool = False
    ) -> None:
        self.pypi = PyPI()
        self.session = create_requests_session()
        self.last_serial: Optional[int] = None
        self.remote_packages: Optional[dict[str, int]] = None
        super().__init__(basedir, local_db, sync_packages)

    def fetch_remote_versions(self) -> dict[str, int]:
        self.last_serial = self.pypi.changelog_last_serial()
        self.remote_packages = self.pypi.list_packages_with_serial()
        logger.info("Remote has %s packages", len(self.remote_packages))
        with overwrite(self.basedir / "remote.json") as f:
            json.dump(self.remote_packages, f)
        logger.info("File saved to remote.json.")
        return self.remote_packages

    def do_update(
        self,
        package_name: str,
        prerelease_excludes: list[re.Pattern[str]],
        use_db: bool = True,
    ) -> Optional[int]:
        logger.info("updating %s", package_name)
        package_simple_path = self.simple_dir / package_name
        package_simple_path.mkdir(exist_ok=True)
        try:
            meta = self.pypi.get_package_metadata(package_name)
            meta_original = deepcopy(meta)
            logger.debug("%s meta: %s", package_name, meta)
        except PackageNotFoundError:
            if (
                self.remote_packages is not None
                and package_name in self.remote_packages
            ):
                recorded_serial = self.remote_packages[package_name]
            else:
                recorded_serial = None
            if (
                recorded_serial is not None
                and self.last_serial is not None
                and abs(recorded_serial - self.last_serial) < IGNORE_THRESHOLD
            ):
                logger.warning(
                    "%s missing from upstream (its serial %s, remote last serial %s), try next time...",
                    package_name,
                    recorded_serial,
                    self.last_serial,
                )
                return None

            logger.warning(
                "%s missing from upstream (its serial %s, remote last serial %s), remove and ignore in the future.",
                package_name,
                recorded_serial,
                self.last_serial,
            )
            # try remove it locally, if it does not exist upstream
            self.do_remove(package_name, use_db=False)
            if not use_db:
                return -1
            self.local_db.set(package_name, -1)
            return None

        # filter prerelease, if necessary
        if match_patterns(package_name, prerelease_excludes):
            filter_release_from_meta(meta, PRERELEASE_PATTERNS)

        if self.sync_packages:
            # sync packages first, then sync index
            existing_hrefs = get_existing_hrefs(package_simple_path)
            existing_hrefs = [] if existing_hrefs is None else existing_hrefs
            release_files = PyPI.get_release_files_from_meta(meta)
            # remove packages that no longer exist remotely
            remote_hrefs = [
                self.pypi.file_url_to_local_url(i["url"]) for i in release_files
            ]
            should_remove = list(set(existing_hrefs) - set(remote_hrefs))
            for href in should_remove:
                p = unquote(href)
                logger.info("removing file %s (if exists)", p)
                package_path = Path(normpath(package_simple_path / p))
                package_path.unlink(missing_ok=True)
            for i in release_files:
                url = i["url"]
                dest = Path(
                    normpath(
                        package_simple_path / self.pypi.file_url_to_local_path(i["url"])
                    )
                )
                logger.info("downloading file %s -> %s", url, dest)
                if self.skip_this_package(i, dest):
                    continue

                dest.parent.mkdir(parents=True, exist_ok=True)
                success, _resp = download(self.session, url, dest)
                if not success:
                    logger.warning("skipping %s as it fails downloading", package_name)
                    return None

        last_serial: int = meta["last_serial"]

        self.write_meta_to_simple(package_simple_path, meta)
        json_meta_path = self.jsonmeta_dir / package_name
        with overwrite(json_meta_path) as f:
            # Note that we're writing meta_original here!
            json.dump(meta_original, f)

        if use_db:
            self.local_db.set(package_name, last_serial)

        return last_serial


class SyncPlainHTTP(SyncBase):
    def __init__(
        self,
        upstream: str,
        basedir: Path,
        local_db: LocalVersionKV,
        sync_packages: bool = False,
        use_pypi_index: bool = False,
    ) -> None:
        self.upstream = upstream
        self.session = create_requests_session()
        self.pypi: Optional[PyPI]
        if use_pypi_index:
            self.pypi = PyPI()
        else:
            self.pypi = None
        super().__init__(basedir, local_db, sync_packages)

    def fetch_remote_versions(self) -> dict[str, int]:
        remote: dict[str, int]
        if not self.pypi:
            remote_url = urljoin(self.upstream, "local.json")
            resp = self.session.get(remote_url)
            resp.raise_for_status()
            remote = resp.json()
        else:
            remote = self.pypi.list_packages_with_serial()
        logger.info("Remote has %s packages", len(remote))
        with overwrite(self.basedir / "remote.json") as f:
            json.dump(remote, f)
        logger.info("File saved to remote.json.")
        return remote

    def do_update(
        self,
        package_name: str,
        prerelease_excludes: list[re.Pattern[str]],
        use_db: bool = True,
    ) -> Optional[int]:
        logger.info("updating %s", package_name)
        package_simple_path = self.simple_dir / package_name
        package_simple_path.mkdir(exist_ok=True)
        if self.sync_packages:
            hrefs = get_existing_hrefs(package_simple_path)
            existing_hrefs = [] if hrefs is None else hrefs
        # Download JSON meta
        file_url = urljoin(self.upstream, f"json/{package_name}")
        success, resp = download(
            self.session, file_url, self.jsonmeta_dir / (package_name + ".new")
        )
        if not success:
            logger.error(
                "download %s JSON meta fails with code %s",
                package_name,
                resp.status_code if resp else None,
            )
            return None
        assert resp
        meta = resp.json()
        # filter prerelease, if necessary
        if match_patterns(package_name, prerelease_excludes):
            filter_release_from_meta(meta, PRERELEASE_PATTERNS)

        if self.sync_packages:
            release_files = PyPI.get_release_files_from_meta(meta)
            remote_hrefs = [PyPI.file_url_to_local_url(i["url"]) for i in release_files]
            should_remove = list(set(existing_hrefs) - set(remote_hrefs))
            for href in should_remove:
                p = unquote(href)
                logger.info("removing file %s (if exists)", p)
                package_path = Path(normpath(package_simple_path / p))
                package_path.unlink(missing_ok=True)
            package_simple_url = urljoin(self.upstream, f"simple/{package_name}/")
            for i in release_files:
                href = PyPI.file_url_to_local_url(i["url"])
                path = PyPI.file_url_to_local_path(i["url"])
                url = urljoin(package_simple_url, href)
                dest = Path(normpath(package_simple_path / path))
                logger.info("downloading file %s -> %s", url, dest)
                if self.skip_this_package(i, dest):
                    continue
                dest.parent.mkdir(parents=True, exist_ok=True)
                success, resp = download(self.session, url, dest)
                if not success:
                    logger.warning("skipping %s as it fails downloading", package_name)
                    return None

        # OK, now it's safe to rename
        (self.jsonmeta_dir / (package_name + ".new")).rename(
            self.jsonmeta_dir / package_name
        )
        # generate indexes
        self.write_meta_to_simple(package_simple_path, meta)

        last_serial: int = meta["last_serial"]
        if use_db:
            self.local_db.set(package_name, last_serial)

        return last_serial


def get_local_serial(package_meta_direntry: os.DirEntry[str]) -> Optional[int]:
    """
    Accepts /json/<package_name> as package_meta_path
    """
    package_name = package_meta_direntry.name
    try:
        contents = fast_readall(Path(package_meta_direntry.path))
    except FileNotFoundError:
        logger.warning("%s does not have JSON metadata, skipping", package_name)
        return None
    try:
        meta = json.loads(contents)
        return meta["last_serial"]  # type: ignore
    except Exception:
        logger.warning("cannot parse %s's JSON metadata", package_name, exc_info=True)
        return None


def sync_shared_args(func: Callable[..., Any]) -> Callable[..., Any]:
    shared_options = [
        click.option(
            "--sync-packages/--no-sync-packages",
            default=False,
            help="Sync packages instead of just indexes, by default it's --no-sync-packages",
        ),
        click.option(
            "--shadowmire-upstream",
            required=False,
            type=str,
            help="Use another upstream using shadowmire instead of PyPI",
        ),
        click.option(
            "--use-pypi-index/--no-use-pypi-index",
            default=False,
            help="Always use PyPI index metadata (via XMLRPC). It's a no-op without --shadowmire-upstream. Some packages might not be downloaded successfully.",
        ),
        click.option(
            "--exclude", multiple=True, help="Remote package names to exclude. Regex."
        ),
        click.option(
            "--prerelease-exclude",
            multiple=True,
            help="Package names of which prereleases will be excluded. Regex.",
        ),
    ]
    for option in shared_options[::-1]:
        func = option(func)
    return func


def read_config(
    ctx: click.Context, param: click.Option, filename: Optional[str]
) -> None:
    # Set default repo as cwd
    ctx.default_map = {}
    ctx.default_map["repo"] = "."

    if filename is None:
        return
    with open(filename, "rb") as f:
        data = tomllib.load(f)
    try:
        options = dict(data["options"])
    except KeyError:
        options = {}
    if options.get("repo"):
        ctx.default_map["repo"] = options["repo"]
        del options["repo"]

    logger.info("Read options from %s: %s", filename, options)

    ctx.default_map["sync"] = options
    ctx.default_map["verify"] = options
    ctx.default_map["do-update"] = options
    ctx.default_map["do-remove"] = options
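# A TOML file passed via --config might look like this (hypothetical values; keys
# under [options] mirror the CLI option names with dashes written as underscores):
#
#   [options]
#   repo = "/srv/pypi"
#   sync_packages = true
#   exclude = ["^some-internal-package$"]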


@click.group()
@click.option(
    "--config",
    type=click.Path(dir_okay=False),
    help="Read option defaults from specified TOML file",
    callback=read_config,
    expose_value=False,
)
@click.option("--repo", type=click.Path(file_okay=False), help="Repo (basedir) path")
@click.pass_context
def cli(ctx: click.Context, repo: str) -> None:
    log_level = logging.DEBUG if os.environ.get("DEBUG") else logging.INFO
    logging.basicConfig(level=log_level, format=LOG_FORMAT)
    ctx.ensure_object(dict)

    if WORKERS > 10:
        logger.warning(
            "You have set a worker value larger than 10, which is forbidden by PyPI maintainers."
        )
        logger.warning("Don't blame me if you were banned!")

    # Make sure basedir is absolute
    basedir = Path(repo).resolve()
    local_db = LocalVersionKV(basedir / "local.db", basedir / "local.json")

    ctx.obj["basedir"] = basedir
    ctx.obj["local_db"] = local_db
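# Typical invocations (illustrative):
#   ./shadowmire.py --repo /srv/pypi sync --sync-packages
#   ./shadowmire.py --config config.toml verify --sync-packages --compare-size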


def exclude_to_excludes(exclude: tuple[str]) -> list[re.Pattern[str]]:
    return [re.compile(i) for i in exclude]


def get_syncer(
    basedir: Path,
    local_db: LocalVersionKV,
    sync_packages: bool,
    shadowmire_upstream: Optional[str],
    use_pypi_index: bool,
) -> SyncBase:
    syncer: SyncBase
    if shadowmire_upstream:
        syncer = SyncPlainHTTP(
            upstream=shadowmire_upstream,
            basedir=basedir,
            local_db=local_db,
            sync_packages=sync_packages,
            use_pypi_index=use_pypi_index,
        )
    else:
        syncer = SyncPyPI(
            basedir=basedir, local_db=local_db, sync_packages=sync_packages
        )
    return syncer


@cli.command(help="Sync from upstream")
@click.pass_context
@sync_shared_args
def sync(
    ctx: click.Context,
    sync_packages: bool,
    shadowmire_upstream: Optional[str],
    exclude: tuple[str],
    prerelease_exclude: tuple[str],
    use_pypi_index: bool,
) -> None:
    basedir: Path = ctx.obj["basedir"]
    local_db: LocalVersionKV = ctx.obj["local_db"]
    excludes = exclude_to_excludes(exclude)
    prerelease_excludes = exclude_to_excludes(prerelease_exclude)
    syncer = get_syncer(
        basedir, local_db, sync_packages, shadowmire_upstream, use_pypi_index
    )
    local = local_db.dump(skip_invalid=False)
    plan = syncer.determine_sync_plan(local, excludes)
    # save plan for debugging
    with overwrite(basedir / "plan.json") as f:
        json.dump(plan, f, default=vars, indent=2)
    success = syncer.do_sync_plan(plan, prerelease_excludes)
    syncer.finalize()

    logger.info("Synchronization finished. Success: %s", success)

    if not success:
        sys.exit(1)


@cli.command(help="(Re)generate local db and json from json/")
@click.pass_context
def genlocal(ctx: click.Context) -> None:
    basedir: Path = ctx.obj["basedir"]
    local_db: LocalVersionKV = ctx.obj["local_db"]
    local = {}
    json_dir = basedir / "json"
    logger.info("Iterating all items under %s", json_dir)
    dir_items = [d for d in fast_iterdir(json_dir, "file")]
    logger.info("Detected %s packages in %s in total", len(dir_items), json_dir)
    with ThreadPoolExecutor(max_workers=IOWORKERS) as executor:
        futures = {
            executor.submit(get_local_serial, package_metapath): package_metapath
            for package_metapath in dir_items
        }
        try:
            for future in tqdm(
                as_completed(futures),
                total=len(dir_items),
                desc="Reading packages from json/",
            ):
                package_name = futures[future].name
                try:
                    serial = future.result()
                    if serial:
                        local[package_name] = serial
                except Exception as e:
                    if isinstance(e, (KeyboardInterrupt)):
                        raise
                    logger.warning(
                        "%s generated an exception", package_name, exc_info=True
                    )
        except (ExitProgramException, KeyboardInterrupt):
            exit_with_futures(futures)
    logger.info(
        "%d out of %d packages have valid serial number", len(local), len(dir_items)
    )
    local_db.nuke(commit=False)
    local_db.batch_set(local)
    local_db.dump_json()


@cli.command(
    help="Verify existing sync from local db, download missing things, remove unreferenced packages"
)
@click.pass_context
@sync_shared_args
@click.option(
    "--remove-not-in-local", is_flag=True, help="Do step 1 instead of skipping"
)
@click.option(
    "--compare-size",
    is_flag=True,
    help="Instead of just checking if it exists, also compare local package size when possible, to decide if local package file is valid",
)
def verify(
    ctx: click.Context,
    sync_packages: bool,
    shadowmire_upstream: Optional[str],
    exclude: tuple[str],
    prerelease_exclude: tuple[str],
    remove_not_in_local: bool,
    compare_size: bool,
    use_pypi_index: bool,
) -> None:
    basedir: Path = ctx.obj["basedir"]
    local_db: LocalVersionKV = ctx.obj["local_db"]
    excludes = exclude_to_excludes(exclude)
    prerelease_excludes = exclude_to_excludes(prerelease_exclude)
    syncer = get_syncer(
        basedir, local_db, sync_packages, shadowmire_upstream, use_pypi_index
    )

    logger.info("====== Step 1. Remove packages NOT in local db ======")
    local_names = set(local_db.keys())
    simple_dirs = {i.name for i in fast_iterdir((basedir / "simple"), "dir")}
    json_files = {i.name for i in fast_iterdir((basedir / "json"), "file")}
    not_in_local = (simple_dirs | json_files) - local_names
    logger.info(
        "%d out of %d local packages NOT in local db",
        len(not_in_local),
        len(local_names),
    )
    for package_name in not_in_local:
        logger.info("package %s not in local db", package_name)
        if remove_not_in_local:
            # Old bandersnatch would download packages without normalization,
            # in which case one package file could have multiple "packages"
            # with different names, but normalized to the same one.
            # So, when in verify, we always set remove_packages=False
            # In step 4 unreferenced files would be removed, anyway.
            syncer.do_remove(package_name, remove_packages=False)

    logger.info("====== Step 2. Remove packages NOT in remote index ======")
    local = local_db.dump(skip_invalid=False)
    plan = syncer.determine_sync_plan(local, excludes)
    logger.info(
        "%s packages NOT in remote index -- this might contain packages that also do not exist locally",
        len(plan.remove),
    )
    for package_name in plan.remove:
        # We only take the plan.remove part here
        logger.info("package %s not in remote index", package_name)
        syncer.do_remove(package_name, remove_packages=False)

    # After some removal, local_names is changed.
    local_names = set(local_db.keys())

    logger.info("====== Step 3. Caching packages/ dirtree in memory for Step 4 & 5.")
    packages_pathcache: set[str] = set()
    with ThreadPoolExecutor(max_workers=IOWORKERS) as executor:

        def packages_iterate(first_dirname: str, position: int) -> list[str]:
            with tqdm(
                desc=f"Iterating packages/{first_dirname}/*/*/*", position=position
            ) as pb:
                res = []
                for d1 in fast_iterdir(basedir / "packages" / first_dirname, "dir"):
                    for d2 in fast_iterdir(d1.path, "dir"):
                        for file in fast_iterdir(d2.path, "file"):
                            pb.update(1)
                            res.append(file.path)
                return res

        futures = {
            executor.submit(packages_iterate, first_dir.name, idx % IOWORKERS): first_dir.name  # type: ignore
            for idx, first_dir in enumerate(fast_iterdir((basedir / "packages"), "dir"))
        }
        try:
            for future in as_completed(futures):
                sname = futures[future]
                try:
                    for p in future.result():
                        packages_pathcache.add(p)
                except Exception as e:
                    if isinstance(e, (KeyboardInterrupt)):
                        raise
                    logger.warning("%s generated an exception", sname, exc_info=True)
                    success = False
        except (ExitProgramException, KeyboardInterrupt):
            exit_with_futures(futures)

    logger.info(
        "====== Step 4. Make sure all local indexes are valid, and (if --sync-packages) have valid local package files ======"
    )
    success = syncer.check_and_update(
        list(local_names),
        prerelease_excludes,
        json_files,
        packages_pathcache,
        compare_size,
    )
    syncer.finalize()

    logger.info(
        "====== Step 5. Remove any unreferenced files in `packages` folder ======"
    )
    ref_set: set[str] = set()
    with ThreadPoolExecutor(max_workers=IOWORKERS) as executor:
        # Part 1: iterate simple/
        def iterate_simple(sname: str) -> list[str]:
            sd = basedir / "simple" / sname
            hrefs = get_existing_hrefs(sd)
            hrefs = [] if hrefs is None else hrefs
            nps = []
            for href in hrefs:
                i = unquote(href)
                # use normpath, which is much faster than pathlib resolve(), as it does not need to access fs
                # we could make sure no symlinks could affect this here
                np = normpath(sd / i)
                logger.debug("add to ref_set: %s", np)
                nps.append(np)
            return nps

        # MyPy does not enjoy same variable name with different types, even when --allow-redefinition
        # Ignore here to make mypy happy
        futures = {
            executor.submit(iterate_simple, sname): sname for sname in simple_dirs  # type: ignore
        }
        try:
            for future in tqdm(
                as_completed(futures),
                total=len(simple_dirs),
                desc="Iterating simple/ directory",
            ):
                sname = futures[future]
                try:
                    nps = future.result()
                    for np in nps:
                        ref_set.add(np)
                except Exception as e:
                    if isinstance(e, (KeyboardInterrupt)):
                        raise
                    logger.warning("%s generated an exception", sname, exc_info=True)
                    success = False
        except (ExitProgramException, KeyboardInterrupt):
            exit_with_futures(futures)

    # Part 2: handling packages
    for path in tqdm(packages_pathcache, desc="Iterating path cache"):
        if path not in ref_set:
            logger.info("removing unreferenced file %s", path)
            Path(path).unlink()

    logger.info("Verification finished. Success: %s", success)

    if not success:
        sys.exit(1)


@cli.command(help="Manual update given package for debugging purpose")
@click.pass_context
@sync_shared_args
@click.argument("package_name")
def do_update(
    ctx: click.Context,
    sync_packages: bool,
    shadowmire_upstream: Optional[str],
    exclude: tuple[str],
    prerelease_exclude: tuple[str],
    use_pypi_index: bool,
    package_name: str,
) -> None:
    basedir: Path = ctx.obj["basedir"]
    local_db: LocalVersionKV = ctx.obj["local_db"]
    excludes = exclude_to_excludes(exclude)
    if excludes:
        logger.warning("--exclude is ignored in do_update()")
    prerelease_excludes = exclude_to_excludes(prerelease_exclude)
    syncer = get_syncer(
        basedir, local_db, sync_packages, shadowmire_upstream, use_pypi_index
    )
    syncer.do_update(package_name, prerelease_excludes)


@cli.command(help="Manual remove given package for debugging purpose")
@click.pass_context
@sync_shared_args
@click.argument("package_name")
def do_remove(
    ctx: click.Context,
    sync_packages: bool,
    shadowmire_upstream: Optional[str],
    exclude: tuple[str],
    prerelease_exclude: tuple[str],
    use_pypi_index: bool,
    package_name: str,
) -> None:
    basedir = ctx.obj["basedir"]
    local_db = ctx.obj["local_db"]
    if exclude or prerelease_exclude:
        logger.warning("exclusion rules are ignored in do_remove()")
    syncer = get_syncer(
        basedir, local_db, sync_packages, shadowmire_upstream, use_pypi_index
    )
    syncer.do_remove(package_name)


@cli.command(help="Call pypi list_packages_with_serial() for debugging")
@click.pass_context
def list_packages_with_serial(ctx: click.Context) -> None:
    basedir = ctx.obj["basedir"]
    local_db = ctx.obj["local_db"]
    syncer = SyncPyPI(basedir, local_db)
    syncer.fetch_remote_versions()


@cli.command(help="Clear invalid package status in local database")
@click.pass_context
def clear_invalid_packages(ctx: click.Context) -> None:
    local_db: LocalVersionKV = ctx.obj["local_db"]
    total = local_db.remove_invalid()
    logger.info("Removed %s invalid status in local database", total)


if __name__ == "__main__":
    cli(obj={})