Use unquote to handle encoded path

In previous versions, shadowmire failed to handle filenames like "logic gates.tar"; instead it would download them as "logic%20gates.tar".

Existing instances using shadowmire might need to re-verify.
taoky 2024-09-04 17:05:58 +08:00
parent 99bd4d932e
commit 47529107ae
2 changed files with 30 additions and 10 deletions
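
For reference, urllib.parse.unquote is what decodes the percent-escapes. A minimal stdlib-only sketch of the behaviour described above (the filename comes from the commit message; the "packages" directory is made up for the example):

    from pathlib import Path
    from urllib.parse import unquote

    href = "logic%20gates.tar"               # href as it appears in the simple index
    print(unquote(href))                     # "logic gates.tar" -- the actual filename
    # Joining the raw href into a Path produces the wrong, percent-encoded name:
    print(Path("packages") / href)           # packages/logic%20gates.tar
    print(Path("packages") / unquote(href))  # packages/logic gates.tar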

shadowmire.py

@@ -7,7 +7,7 @@ import xmlrpc.client
 from dataclasses import dataclass
 import re
 import json
-from urllib.parse import urljoin, urlparse, urlunparse
+from urllib.parse import urljoin, urlparse, urlunparse, unquote
 from pathlib import Path
 from html.parser import HTMLParser
 import logging
@@ -340,11 +340,25 @@ class PyPI:
 
     @staticmethod
     def file_url_to_local_url(url: str) -> str:
+        """
+        This function should NOT be used to construct a local Path!
+        """
         parsed = urlparse(url)
         assert parsed.path.startswith("/packages")
         prefix = "../.."
         return prefix + parsed.path
 
+    @staticmethod
+    def file_url_to_local_path(url: str) -> Path:
+        """
+        Unquote() and returns a Path
+        """
+        path = urlparse(url).path
+        path = unquote(path)
+        assert path.startswith("/packages")
+        path = path[1:]
+        return Path("../..") / path
+
     # Func modified from bandersnatch
     @classmethod
     def generate_html_simple_page(cls, package_meta: dict) -> str:
@@ -574,13 +588,14 @@ class SyncBase:
         # OK, check if all hrefs have corresponding files
         if self.sync_packages:
             for href, size in hrefsize_json:
-                dest_pathstr = normpath(package_simple_path / href)
+                relative_path = unquote(href)
+                dest_pathstr = normpath(package_simple_path / relative_path)
                 try:
                     # Fast shortcut to avoid stat() it
                     if dest_pathstr not in packages_pathcache:
                         raise FileNotFoundError
                     if compare_size and size != -1:
                         dest = Path(dest_pathstr)
                         # So, do stat() for real only when we need to do so,
                         # have a size, and it really exists in pathcache.
                         dest_stat = dest.stat()
@@ -849,7 +864,8 @@ class SyncPyPI(SyncBase):
             self.pypi.file_url_to_local_url(i["url"]) for i in release_files
         ]
         should_remove = list(set(existing_hrefs) - set(remote_hrefs))
-        for p in should_remove:
+        for href in should_remove:
+            p = unquote(href)
             logger.info("removing file %s (if exists)", p)
             package_path = Path(normpath(package_simple_path / p))
             package_path.unlink(missing_ok=True)
@@ -857,7 +873,7 @@ class SyncPyPI(SyncBase):
             url = i["url"]
             dest = Path(
                 normpath(
-                    package_simple_path / self.pypi.file_url_to_local_url(i["url"])
+                    package_simple_path / self.pypi.file_url_to_local_path(i["url"])
                 )
             )
             logger.info("downloading file %s -> %s", url, dest)
@@ -951,15 +967,17 @@ class SyncPlainHTTP(SyncBase):
         release_files = PyPI.get_release_files_from_meta(meta)
         remote_hrefs = [PyPI.file_url_to_local_url(i["url"]) for i in release_files]
         should_remove = list(set(existing_hrefs) - set(remote_hrefs))
-        for p in should_remove:
+        for href in should_remove:
+            p = unquote(href)
             logger.info("removing file %s (if exists)", p)
             package_path = Path(normpath(package_simple_path / p))
             package_path.unlink(missing_ok=True)
         package_simple_url = urljoin(self.upstream, f"simple/{package_name}/")
         for i in release_files:
             href = PyPI.file_url_to_local_url(i["url"])
+            path = PyPI.file_url_to_local_path(i["url"])
             url = urljoin(package_simple_url, href)
-            dest = Path(normpath(package_simple_path / href))
+            dest = Path(normpath(package_simple_path / path))
             logger.info("downloading file %s -> %s", url, dest)
             if self.skip_this_package(i, dest):
                 continue
@@ -1313,12 +1331,12 @@ def verify(
 
         hrefs = get_existing_hrefs(sd)
         hrefs = [] if hrefs is None else hrefs
        nps = []
-        for i in hrefs:
+        for href in hrefs:
+            i = unquote(href)
             # use normpath, which is much faster than pathlib resolve(), as it does not need to access fs
             # we could make sure no symlinks could affect this here
             np = normpath(sd / i)
             logger.debug("add to ref_set: %s", np)
-            # ref_set.add(np)
             nps.append(np)
         return nps
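
To illustrate the split between the two helpers above, here is a small standalone sketch that mirrors them outside of shadowmire (the release-file URL and its hash prefix are invented for the example):

    from pathlib import Path
    from urllib.parse import unquote, urlparse

    def file_url_to_local_url(url: str) -> str:
        # Keeps the percent-encoding: suitable for hrefs in generated HTML, not for a Path.
        parsed = urlparse(url)
        assert parsed.path.startswith("/packages")
        return "../.." + parsed.path

    def file_url_to_local_path(url: str) -> Path:
        # Decodes the percent-encoding: this is where the file actually lives on disk.
        path = unquote(urlparse(url).path)
        assert path.startswith("/packages")
        return Path("../..") / path[1:]

    url = "https://files.pythonhosted.org/packages/ab/cd/logic%20gates.tar"  # hypothetical
    print(file_url_to_local_url(url))   # ../../packages/ab/cd/logic%20gates.tar
    print(file_url_to_local_path(url))  # ../../packages/ab/cd/logic gates.tar

Keeping the quoted form for hrefs and the unquoted form for filesystem paths is why the removal and download loops above now call unquote (or file_url_to_local_path) before touching the disk.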

utils/create_package_stubs.py

@@ -3,6 +3,7 @@
 # It requires a full simple/ and db (genlocal-ed)
 # Call like: python -m utils.create_package_stubs /path/to/pypi/
 
+from urllib.parse import unquote
 from shadowmire import LocalVersionKV, get_package_urls_size_from_index_json
 from pathlib import Path
 import sys
@@ -28,7 +29,8 @@ if __name__ == "__main__":
         json_simple = package_simple_path / "index.v1_json"
         hrefsize_json = get_package_urls_size_from_index_json(json_simple)
         for href, _ in hrefsize_json:
-            dest = Path(normpath(package_simple_path / href))
+            relative = unquote(href)
+            dest = Path(normpath(package_simple_path / relative))
             dest.parent.mkdir(parents=True, exist_ok=True)
             if not dest.exists():
                 dest.touch()