Use unquote to handle encoded path

In previous versions shadowmire fails to handle filename like "logic gates.tar", instead it would download as "logic%20gates.tar".

Existing instances using shadowmire might need to re-verify.
This commit is contained in:
taoky 2024-09-04 17:05:58 +08:00
parent 99bd4d932e
commit 47529107ae
2 changed files with 30 additions and 10 deletions

View File

@ -7,7 +7,7 @@ import xmlrpc.client
from dataclasses import dataclass
import re
import json
from urllib.parse import urljoin, urlparse, urlunparse
from urllib.parse import urljoin, urlparse, urlunparse, unquote
from pathlib import Path
from html.parser import HTMLParser
import logging
@ -340,11 +340,25 @@ class PyPI:
@staticmethod
def file_url_to_local_url(url: str) -> str:
"""
This function should NOT be used to construct a local Path!
"""
parsed = urlparse(url)
assert parsed.path.startswith("/packages")
prefix = "../.."
return prefix + parsed.path
@staticmethod
def file_url_to_local_path(url: str) -> Path:
"""
Unquote() and returns a Path
"""
path = urlparse(url).path
path = unquote(path)
assert path.startswith("/packages")
path = path[1:]
return Path("../..") / path
# Func modified from bandersnatch
@classmethod
def generate_html_simple_page(cls, package_meta: dict) -> str:
@ -574,13 +588,14 @@ class SyncBase:
# OK, check if all hrefs have corresponding files
if self.sync_packages:
for href, size in hrefsize_json:
dest_pathstr = normpath(package_simple_path / href)
relative_path = unquote(href)
dest_pathstr = normpath(package_simple_path / relative_path)
try:
# Fast shortcut to avoid stat() it
if dest_pathstr not in packages_pathcache:
raise FileNotFoundError
if compare_size and size != -1:
dest = Path(normpath(package_simple_path / href))
dest = Path(dest_pathstr)
# So, do stat() for real only when we need to do so,
# have a size, and it really exists in pathcache.
dest_stat = dest.stat()
@ -849,7 +864,8 @@ class SyncPyPI(SyncBase):
self.pypi.file_url_to_local_url(i["url"]) for i in release_files
]
should_remove = list(set(existing_hrefs) - set(remote_hrefs))
for p in should_remove:
for href in should_remove:
p = unquote(href)
logger.info("removing file %s (if exists)", p)
package_path = Path(normpath(package_simple_path / p))
package_path.unlink(missing_ok=True)
@ -857,7 +873,7 @@ class SyncPyPI(SyncBase):
url = i["url"]
dest = Path(
normpath(
package_simple_path / self.pypi.file_url_to_local_url(i["url"])
package_simple_path / self.pypi.file_url_to_local_path(i["url"])
)
)
logger.info("downloading file %s -> %s", url, dest)
@ -951,15 +967,17 @@ class SyncPlainHTTP(SyncBase):
release_files = PyPI.get_release_files_from_meta(meta)
remote_hrefs = [PyPI.file_url_to_local_url(i["url"]) for i in release_files]
should_remove = list(set(existing_hrefs) - set(remote_hrefs))
for p in should_remove:
for href in should_remove:
p = unquote(href)
logger.info("removing file %s (if exists)", p)
package_path = Path(normpath(package_simple_path / p))
package_path.unlink(missing_ok=True)
package_simple_url = urljoin(self.upstream, f"simple/{package_name}/")
for i in release_files:
href = PyPI.file_url_to_local_url(i["url"])
path = PyPI.file_url_to_local_path(i["url"])
url = urljoin(package_simple_url, href)
dest = Path(normpath(package_simple_path / href))
dest = Path(normpath(package_simple_path / path))
logger.info("downloading file %s -> %s", url, dest)
if self.skip_this_package(i, dest):
continue
@ -1313,12 +1331,12 @@ def verify(
hrefs = get_existing_hrefs(sd)
hrefs = [] if hrefs is None else hrefs
nps = []
for i in hrefs:
for href in hrefs:
i = unquote(href)
# use normpath, which is much faster than pathlib resolve(), as it does not need to access fs
# we could make sure no symlinks could affect this here
np = normpath(sd / i)
logger.debug("add to ref_set: %s", np)
# ref_set.add(np)
nps.append(np)
return nps

View File

@ -3,6 +3,7 @@
# It requires a full simple/ and db (genlocal-ed)
# Call like: python -m utils.create_package_stubs /path/to/pypi/
from urllib.parse import unquote
from shadowmire import LocalVersionKV, get_package_urls_size_from_index_json
from pathlib import Path
import sys
@ -28,7 +29,8 @@ if __name__ == "__main__":
json_simple = package_simple_path / "index.v1_json"
hrefsize_json = get_package_urls_size_from_index_json(json_simple)
for href, _ in hrefsize_json:
dest = Path(normpath(package_simple_path / href))
relative = unquote(href)
dest = Path(normpath(package_simple_path / relative))
dest.parent.mkdir(parents=True, exist_ok=True)
if not dest.exists():
dest.touch()