Add json/<package_name> support and bug fixes

taoky 2024-08-06 03:06:21 +08:00
parent 2aa8c9ae97
commit 70a99b8594
3 changed files with 128 additions and 103 deletions

.gitignore

@@ -6,3 +6,4 @@ plan.json
 remote.json
 venv/
 packages/
+json/

README.md

@@ -71,6 +71,9 @@ If you already have a pypi repo, use `genlocal` first to generate a local db:
 ./shadowmire.py genlocal
 ```
 
+> [!IMPORTANT]
+> You must have the `json/<package_name>` files in place before running `genlocal`.
+
 Verify command could be used if you believe that something is wrong (inconsistent). It would:
 1. remove packages NOT in local db
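
For orientation only (this sketch is not part of the commit): one way to check that the JSON metadata files are in place before running `genlocal`. The helper name and the current-directory layout are assumptions.

```python
from pathlib import Path

def packages_missing_json_meta(basedir: Path) -> list[str]:
    """Hypothetical helper: list packages under simple/ that lack a json/<package_name> file."""
    json_dir = basedir / "json"
    return sorted(
        p.name
        for p in (basedir / "simple").iterdir()
        if p.is_dir() and not (json_dir / p.name).exists()
    )

if __name__ == "__main__":
    missing = packages_missing_json_meta(Path("."))
    if missing:
        print(f"{len(missing)} packages lack JSON metadata; sync them before running genlocal")
```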

shadowmire.py

@@ -71,7 +71,7 @@ class LocalVersionKV:
     def get(self, key: str) -> Optional[int]:
         cur = self.conn.cursor()
-        res = cur.execute("SELECT key, value FROM local WHERE key = ?", (key,))
+        res = cur.execute("SELECT value FROM local WHERE key = ?", (key,))
         row = res.fetchone()
         return row[0] if row else None
@@ -186,13 +186,14 @@ def get_packages_from_index_html(contents: str) -> list[str]:
 def get_existing_hrefs(package_simple_path: Path) -> list[str]:
+    """
+    There exists packages that have no release files, so when it encounters errors it would return None,
+    otherwise empty list or list with hrefs.
+    """
     existing_hrefs = []
-    with open(package_simple_path / "index.html") as f:
-        contents = f.read()
-        existing_hrefs = get_packages_from_index_html(contents)
+    try:
+        with open(package_simple_path / "index.html") as f:
+            contents = f.read()
+            existing_hrefs = get_packages_from_index_html(contents)
+    except FileNotFoundError:
+        pass
     return existing_hrefs
@@ -237,21 +238,24 @@ class PyPI:
             raise PackageNotFoundError
         return req.json()
 
-    def get_release_files_from_meta(self, package_meta: dict) -> list[dict]:
+    @staticmethod
+    def get_release_files_from_meta(package_meta: dict) -> list[dict]:
         release_files = []
         for release in package_meta["releases"].values():
             release_files.extend(release)
         release_files.sort(key=lambda x: x["filename"])
         return release_files
 
-    def file_url_to_local_url(self, url: str) -> str:
+    @staticmethod
+    def file_url_to_local_url(url: str) -> str:
         parsed = urlparse(url)
         assert parsed.path.startswith("/packages")
         prefix = "../.."
         return prefix + parsed.path
 
     # Func modified from bandersnatch
-    def generate_html_simple_page(self, package_meta: dict) -> str:
+    @classmethod
+    def generate_html_simple_page(cls, package_meta: dict) -> str:
         package_rawname = package_meta["info"]["name"]
         simple_page_content = (
             "<!DOCTYPE html>\n"
@@ -264,7 +268,7 @@ class PyPI:
             " <h1>Links for {1}</h1>\n"
         ).format("1.0", package_rawname)
 
-        release_files = self.get_release_files_from_meta(package_meta)
+        release_files = cls.get_release_files_from_meta(package_meta)
 
         def gen_html_file_tags(release: dict) -> str:
             file_tags = ""
@@ -289,9 +293,9 @@ class PyPI:
         simple_page_content += "\n".join(
             [
                 ' <a href="{}#{}={}"{}>{}</a><br/>'.format(
-                    self.file_url_to_local_url(r["url"]),
-                    self.digest_name,
-                    r["digests"][self.digest_name],
+                    cls.file_url_to_local_url(r["url"]),
+                    cls.digest_name,
+                    r["digests"][cls.digest_name],
                     gen_html_file_tags(r),
                     r["filename"],
                 )
@@ -306,7 +310,8 @@ class PyPI:
         return simple_page_content
 
     # Func modified from bandersnatch
-    def generate_json_simple_page(self, package_meta: dict) -> str:
+    @classmethod
+    def generate_json_simple_page(cls, package_meta: dict) -> str:
         package_json: dict[str, Any] = {
             "files": [],
             "meta": {
@@ -318,7 +323,7 @@ class PyPI:
             "versions": sorted(package_meta["releases"].keys()),
         }
 
-        release_files = self.get_release_files_from_meta(package_meta)
+        release_files = cls.get_release_files_from_meta(package_meta)
 
         # Add release files into the JSON dict
         for r in release_files:
@@ -326,12 +331,12 @@ class PyPI:
                 {
                     "filename": r["filename"],
                     "hashes": {
-                        self.digest_name: r["digests"][self.digest_name],
+                        cls.digest_name: r["digests"][cls.digest_name],
                     },
                     "requires-python": r.get("requires_python", ""),
                     "size": r["size"],
                     "upload-time": r.get("upload_time_iso_8601", ""),
-                    "url": self.file_url_to_local_url(r["url"]),
+                    "url": cls.file_url_to_local_url(r["url"]),
                     "yanked": r.get("yanked", False),
                 }
             )
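
A small usage sketch (illustrative, not part of the commit) of what the `@staticmethod`/`@classmethod` conversion enables: the simple-page generators can now be called from cached metadata without constructing a `PyPI` client. The import path and the sample file name are assumptions.

```python
import json
from pathlib import Path

from shadowmire import PyPI  # assumes shadowmire.py is importable from the working directory

# Hypothetical cached metadata file, as written by this commit's sync code.
meta = json.loads(Path("json/requests").read_text())

html_page = PyPI.generate_html_simple_page(meta)  # no PyPI() instance or HTTP session needed
json_page = PyPI.generate_json_simple_page(meta)
print(len(html_page), len(json_page))
```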
@@ -366,9 +371,11 @@ class SyncBase:
         self.local_db = local_db
         self.simple_dir = basedir / "simple"
         self.packages_dir = basedir / "packages"
+        self.jsonmeta_dir = basedir / "json"
         # create the dirs, if not exist
         self.simple_dir.mkdir(parents=True, exist_ok=True)
         self.packages_dir.mkdir(parents=True, exist_ok=True)
+        self.jsonmeta_dir.mkdir(parents=True, exist_ok=True)
         self.sync_packages = sync_packages
 
     def filter_remote_with_excludes(
@@ -421,9 +428,15 @@ class SyncBase:
     def check_and_update(self, package_names: list[str]) -> None:
         to_update = []
         for package_name in tqdm(package_names, desc="Checking consistency"):
+            package_jsonmeta_path = self.basedir / "json" / package_name
+            if not package_jsonmeta_path.exists():
+                to_update.append(package_name)
+                continue
             package_simple_path = self.basedir / "simple" / package_name
-            hrefs = get_existing_hrefs(package_simple_path)
-            if not hrefs:
+            try:
+                hrefs = get_existing_hrefs(package_simple_path)
+            except Exception:
+                # something unexpected happens...
                 to_update.append(package_name)
                 continue
             # OK, check if all hrefs have corresponding files
@@ -452,7 +465,9 @@ class SyncBase:
                 for idx, package_name in enumerate(package_names)
             }
             try:
-                for future in tqdm(as_completed(futures), total=len(package_names), desc="Updating"):
+                for future in tqdm(
+                    as_completed(futures), total=len(package_names), desc="Updating"
+                ):
                     idx, package_name = futures[future]
                     try:
                         serial = future.result()
@@ -483,36 +498,49 @@ class SyncBase:
             self.parallel_update(to_update, prerelease_excludes)
 
-    def do_remove(self, package_name: str, write_db: bool = True) -> None:
-        logger.info("removing %s", package_name)
-        meta_dir = self.simple_dir / package_name
-        index_html = meta_dir / "index.html"
-        try:
-            with open(index_html) as f:
-                packages_to_remove = get_packages_from_index_html(f.read())
-            for p in packages_to_remove:
-                p_path = meta_dir / p
-                try:
-                    p_path.unlink()
-                    logger.info("Removed file %s", p_path)
-                except FileNotFoundError:
-                    pass
-            # remove all files inside meta_dir
-            if write_db:
-                self.local_db.remove(package_name)
-            remove_dir_with_files(meta_dir)
-        except FileNotFoundError:
-            logger.warning("FileNotFoundError when removing %s", package_name)
-            pass
+    def do_remove(self, package_name: str, use_db: bool = True) -> None:
+        metajson_path = self.jsonmeta_dir / package_name
+        if metajson_path.exists():
+            # To make this less noisy...
+            logger.info("removing %s", package_name)
+        package_simple_dir = self.simple_dir / package_name
+        index_html = package_simple_dir / "index.html"
+        if index_html.exists():
+            with open(index_html) as f:
+                packages_to_remove = get_packages_from_index_html(f.read())
+            paths_to_remove = [package_simple_dir / p for p in packages_to_remove]
+            for p in paths_to_remove:
+                if p.exists():
+                    p.unlink()
+                    logger.info("Removed file %s", p)
+        remove_dir_with_files(package_simple_dir)
+        metajson_path = self.jsonmeta_dir / package_name
+        metajson_path.unlink(missing_ok=True)
+        if use_db:
+            old_serial = self.local_db.get(package_name)
+            if old_serial != -1:
+                self.local_db.remove(package_name)
 
     def do_update(
         self,
         package_name: str,
         prerelease_excludes: list[re.Pattern[str]],
-        write_db: bool = True,
+        use_db: bool = True,
     ) -> Optional[int]:
         raise NotImplementedError
 
+    def write_meta_to_simple(self, package_simple_path: Path, meta: dict) -> None:
+        simple_html_contents = PyPI.generate_html_simple_page(meta)
+        simple_json_contents = PyPI.generate_json_simple_page(meta)
+        for html_filename in ("index.html", "index.v1_html"):
+            html_path = package_simple_path / html_filename
+            with overwrite(html_path) as f:
+                f.write(simple_html_contents)
+        for json_filename in ("index.v1_json",):
+            json_path = package_simple_path / json_filename
+            with overwrite(json_path) as f:
+                f.write(simple_json_contents)
+
     def finalize(self) -> None:
         local_names = self.local_db.keys()
         # generate index.html at basedir
@@ -535,22 +563,24 @@ class SyncBase:
         self.local_db.dump_json()
 
 
-def download(session: requests.Session, url: str, dest: Path) -> tuple[bool, int]:
+def download(
+    session: requests.Session, url: str, dest: Path
+) -> tuple[bool, Optional[requests.Response]]:
     try:
         resp = session.get(url, allow_redirects=True)
     except requests.RequestException:
         logger.warning("download %s failed with exception", exc_info=True)
-        return False, -1
+        return False, None
     if resp.status_code >= 400:
         logger.warning(
             "download %s failed with status %s, skipping this package",
             url,
             resp.status_code,
         )
-        return False, resp.status_code
+        return False, resp
     with overwrite(dest, "wb") as f:
         f.write(resp.content)
-    return True, resp.status_code
+    return True, resp
 
 
 class SyncPyPI(SyncBase):
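
For clarity, a caller of the new `download()` contract might look like the sketch below (an illustrative assumption, not part of the commit): `resp` is `None` only when the request itself raised, otherwise it is the full `requests.Response`. The import path and URL are hypothetical.

```python
import requests
from pathlib import Path

from shadowmire import download  # assumes shadowmire.py is importable

session = requests.Session()
ok, resp = download(
    session, "https://example.org/packages/foo-1.0.tar.gz", Path("/tmp/foo-1.0.tar.gz")
)
if not ok:
    if resp is not None and resp.status_code == 404:
        print("file is gone upstream; the caller may treat the package as removed")
    else:
        print("request raised or failed with another HTTP error; skip for now")
```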
@@ -572,7 +602,7 @@ class SyncPyPI(SyncBase):
         self,
         package_name: str,
         prerelease_excludes: list[re.Pattern[str]],
-        write_db: bool = True,
+        use_db: bool = True,
     ) -> Optional[int]:
         logger.info("updating %s", package_name)
         package_simple_path = self.simple_dir / package_name
@@ -582,11 +612,12 @@ class SyncPyPI(SyncBase):
             logger.debug("%s meta: %s", package_name, meta)
         except PackageNotFoundError:
             logger.warning(
-                "%s missing from upstream, skip and ignore in the future.", package_name
+                "%s missing from upstream, remove and ignore in the future.",
+                package_name,
             )
             # try remove it locally, if it does not exist upstream
-            self.do_remove(package_name, write_db=False)
-            if not write_db:
+            self.do_remove(package_name, use_db=False)
+            if not use_db:
                 return -1
             self.local_db.set(package_name, -1)
             return None
@@ -600,7 +631,7 @@ class SyncPyPI(SyncBase):
         if self.sync_packages:
             # sync packages first, then sync index
             existing_hrefs = get_existing_hrefs(package_simple_path)
-            release_files = self.pypi.get_release_files_from_meta(meta)
+            release_files = PyPI.get_release_files_from_meta(meta)
             # remove packages that no longer exist remotely
             remote_hrefs = [
                 self.pypi.file_url_to_local_url(i["url"]) for i in release_files
@@ -619,25 +650,19 @@ class SyncPyPI(SyncBase):
                 if dest.exists():
                     continue
                 dest.parent.mkdir(parents=True, exist_ok=True)
-                success, code = download(self.session, url, dest)
+                success, resp = download(self.session, url, dest)
                 if not success:
                     logger.warning("skipping %s as it fails downloading", package_name)
                     return None
 
         last_serial: int = meta["last_serial"]
 
-        simple_html_contents = self.pypi.generate_html_simple_page(meta)
-        simple_json_contents = self.pypi.generate_json_simple_page(meta)
-        for html_filename in ("index.html", "index.v1_html"):
-            html_path = package_simple_path / html_filename
-            with overwrite(html_path) as f:
-                f.write(simple_html_contents)
-        for json_filename in ("index.v1_json",):
-            json_path = package_simple_path / json_filename
-            with overwrite(json_path) as f:
-                f.write(simple_json_contents)
+        self.write_meta_to_simple(package_simple_path, meta)
+        json_meta_path = self.jsonmeta_dir / package_name
+        with overwrite(json_meta_path) as f:
+            json.dump(meta, f)
 
-        if write_db:
+        if use_db:
             self.local_db.set(package_name, last_serial)
 
         return last_serial
@@ -666,7 +691,7 @@ class SyncPlainHTTP(SyncBase):
         self,
         package_name: str,
         prerelease_excludes: list[re.Pattern[str]],
-        write_db: bool = True,
+        use_db: bool = True,
     ) -> Optional[int]:
         if prerelease_excludes:
             logger.warning(
@@ -677,68 +702,65 @@ class SyncPlainHTTP(SyncBase):
         package_simple_path.mkdir(exist_ok=True)
         if self.sync_packages:
             existing_hrefs = get_existing_hrefs(package_simple_path)
-        # directly fetch remote files
-        for filename in ("index.html", "index.v1_html", "index.v1_json"):
-            file_url = urljoin(self.upstream, f"/simple/{package_name}/{filename}")
-            # Don't overwrite existing index first!
-            success, code = download(
-                self.session, file_url, package_simple_path / (filename + ".new")
-            )
-            if not success:
-                if filename != "index.html":
-                    logger.warning("index file %s fails", file_url)
-                    continue
-                else:
-                    logger.error(
-                        "critical index file %s fails. Stop with this.", file_url
-                    )
-                    if code == 404:
-                        self.do_remove(package_name, write_db=False)
-                    # We don't return -1 here, as shadowmire upstream would fix this inconsistency next time syncing.
-                    return None
+        # Download JSON meta
+        file_url = urljoin(self.upstream, f"/json/{package_name}")
+        success, resp = download(
+            self.session, file_url, self.jsonmeta_dir / (package_name + ".new")
+        )
+        if not success:
+            logger.error(
+                "download %s JSON meta fails with code %s",
+                package_name,
+                resp.status_code if resp else None,
+            )
+            return None
+        assert resp
+        meta = resp.json()
 
         if self.sync_packages:
-            current_hrefs = get_existing_hrefs(package_simple_path)
-            should_remove = list(set(existing_hrefs) - set(current_hrefs))
+            release_files = PyPI.get_release_files_from_meta(meta)
+            remote_hrefs = [PyPI.file_url_to_local_url(i["url"]) for i in release_files]
+            should_remove = list(set(existing_hrefs) - set(remote_hrefs))
             for p in should_remove:
                 logger.info("removing file %s (if exists)", p)
                 package_path = (package_simple_path / p).resolve()
                 package_path.unlink(missing_ok=True)
             package_simple_url = urljoin(self.upstream, f"/simple/{package_name}/")
-            for href in current_hrefs:
+            for href in remote_hrefs:
                 url = urljoin(package_simple_url, href)
                 dest = (package_simple_path / href).resolve()
                 logger.info("downloading file %s -> %s", url, dest)
                 if dest.exists():
                     continue
                 dest.parent.mkdir(parents=True, exist_ok=True)
-                success, code = download(self.session, url, dest)
+                success, resp = download(self.session, url, dest)
                 if not success:
                     logger.warning("skipping %s as it fails downloading", package_name)
                     return None
 
         # OK, now it's safe to rename
-        for filename in ("index.html", "index.v1_html", "index.v1_json"):
-            file_from = package_simple_path / (filename + ".new")
-            file_to = package_simple_path / filename
-            try:
-                file_from.rename(file_to)
-            except FileNotFoundError:
-                pass
+        (self.jsonmeta_dir / (package_name + ".new")).rename(
+            self.jsonmeta_dir / package_name
+        )
+        # generate indexes
+        self.write_meta_to_simple(package_simple_path, meta)
 
         last_serial = get_local_serial(package_simple_path)
         if not last_serial:
             logger.warning("cannot get valid package serial from %s", package_name)
         else:
-            if write_db:
+            if use_db:
                 self.local_db.set(package_name, last_serial)
 
         return last_serial
 
 
-def get_local_serial(package_simple_path: Path) -> Optional[int]:
-    package_name = package_simple_path.name
-    package_index_path = package_simple_path / "index.html"
+def get_local_serial(package_meta_path: Path) -> Optional[int]:
+    """
+    Accepts /json/<package_name> as package_meta_path
+    """
+    package_name = package_meta_path.name
+    package_index_path = package_meta_path / "index.html"
     try:
         with open(package_index_path) as f:
             contents = f.read()
@@ -746,11 +768,10 @@ def get_local_serial(package_simple_path: Path) -> Optional[int]:
         logger.warning("%s does not have index.html, skipping", package_name)
         return None
     try:
-        serial_comment = contents.splitlines()[-1].strip()
-        serial = int(serial_comment.removeprefix("<!--SERIAL ").removesuffix("-->"))
-        return serial
+        meta = json.loads(contents)
+        return meta["last_serial"]
     except Exception:
-        logger.warning("cannot parse %s index.html", package_name, exc_info=True)
+        logger.warning("cannot parse %s's JSON metadata", package_name, exc_info=True)
         return None
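
As an illustration (an assumption about the data shape, not text from the commit): the saved `json/<package_name>` file is the raw PyPI JSON API document, so the serial can be read back directly. The sample package name is hypothetical.

```python
import json
from pathlib import Path

# Hypothetical standalone check mirroring the new JSON-based parsing above.
meta = json.loads(Path("json/requests").read_text())
print(meta["last_serial"])    # PyPI changelog serial stored with the package
print(len(meta["releases"]))  # number of versions known for the package
```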
@@ -850,15 +871,15 @@ def sync(
     syncer.finalize()
 
 
-@cli.command(help="(Re)generate local db and json from simple/")
+@cli.command(help="(Re)generate local db and json from json/")
 @click.pass_context
 def genlocal(ctx: click.Context) -> None:
     basedir: Path = ctx.obj["basedir"]
     local_db: LocalVersionKV = ctx.obj["local_db"]
     local = {}
-    for package_path in (basedir / "simple").iterdir():
-        package_name = package_path.name
-        serial = get_local_serial(package_path)
+    for package_metapath in (basedir / "json").iterdir():
+        package_name = package_metapath.name
+        serial = get_local_serial(package_metapath)
         if serial:
             local[package_name] = serial
     local_db.nuke(commit=False)