SyncPlainHTTP: package downloading support

commit 52ceabcdd4 (parent: a4574ecaeb)
Author: taoky
Date: 2024-08-02 20:20:25 +08:00

@@ -163,6 +163,17 @@ def get_packages_from_index_html(contents: str) -> list[str]:
     return ret


+def get_existing_hrefs(package_simple_path: Path) -> list[str]:
+    existing_hrefs = []
+    try:
+        with open(package_simple_path / "index.html") as f:
+            contents = f.read()
+        existing_hrefs = get_packages_from_index_html(contents)
+    except FileNotFoundError:
+        pass
+    return existing_hrefs
+
+
 class CustomXMLRPCTransport(xmlrpc.client.Transport):
     """
     Set user-agent for xmlrpc.client
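
Note: a minimal sketch (not part of the commit) of what the new
get_existing_hrefs() helper is expected to return. The href values here
are made up; their relative "../../packages/..." shape follows
file_url_to_local_url() in the hunks below.

    from pathlib import Path

    # Package directory that already has an index.html:
    hrefs = get_existing_hrefs(Path("simple/requests"))
    # e.g. ["../../packages/ab/cd/requests-2.32.3-py3-none-any.whl", ...]

    # A directory without index.html is treated as "no files yet":
    assert get_existing_hrefs(Path("simple/never-synced")) == []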
@@ -208,7 +219,7 @@ class PyPI:
         release_files.sort(key=lambda x: x["filename"])
         return release_files

-    def _file_url_to_local_url(self, url: str) -> str:
+    def file_url_to_local_url(self, url: str) -> str:
         parsed = urlparse(url)
         assert parsed.path.startswith("/packages")
         prefix = "../.."
@@ -253,7 +264,7 @@ class PyPI:
         simple_page_content += "\n".join(
             [
                 ' <a href="{}#{}={}"{}>{}</a><br/>'.format(
-                    self._file_url_to_local_url(r["url"]),
+                    self.file_url_to_local_url(r["url"]),
                     self.digest_name,
                     r["digests"][self.digest_name],
                     gen_html_file_tags(r),
@@ -295,7 +306,7 @@ class PyPI:
                     "requires-python": r.get("requires_python", ""),
                     "size": r["size"],
                     "upload-time": r.get("upload_time_iso_8601", ""),
-                    "url": self._file_url_to_local_url(r["url"]),
+                    "url": self.file_url_to_local_url(r["url"]),
                     "yanked": r.get("yanked", False),
                 }
             )
@@ -438,6 +449,16 @@ class SyncBase:
             f.write(" </body>\n</html>")


+def download(session: requests.Session, url: str, dest: Path) -> bool:
+    resp = session.get(url, allow_redirects=True)
+    if resp.status_code >= 400:
+        logger.warning("download %s failed, skipping this package", url)
+        return False
+    with overwrite(dest, "wb") as f:
+        f.write(resp.content)
+    return True
+
+
 class SyncPyPI(SyncBase):
     def __init__(
         self, basedir: Path, local_db: LocalVersionKV, sync_packages: bool = False
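
Note: the new module-level download() helper returns a bool instead of
raising on HTTP errors, so each caller decides whether a failed fetch is
fatal. A hedged usage sketch; the session, URL, and destination path are
made-up values, and overwrite() is the module's existing file-writing
context manager:

    from pathlib import Path
    import requests

    session = requests.Session()
    ok = download(
        session,
        "https://example.invalid/packages/source/f/foo/foo-1.0.tar.gz",
        Path("packages/source/f/foo/foo-1.0.tar.gz"),
    )
    if not ok:
        # download() has already logged a warning; skip this file and
        # let the caller continue with the next one.
        pass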
@@ -466,16 +487,12 @@ class SyncPyPI(SyncBase):

         if self.sync_packages:
             # sync packages first, then sync index
-            existing_hrefs = []
-            try:
-                with open(package_simple_path / "index.html") as f:
-                    contents = f.read()
-                existing_hrefs = get_packages_from_index_html(contents)
-            except FileNotFoundError:
-                pass
+            existing_hrefs = get_existing_hrefs(package_simple_path)
             release_files = self.pypi.get_release_files_from_meta(meta)
             # remove packages that no longer exist remotely
-            remote_hrefs = [self.pypi._file_url_to_local_url(i["url"]) for i in release_files]
+            remote_hrefs = [
+                self.pypi.file_url_to_local_url(i["url"]) for i in release_files
+            ]
             should_remove = list(set(existing_hrefs) - set(remote_hrefs))
             for p in should_remove:
                 logger.info("removing file %s (if exists)", p)
@@ -483,17 +500,14 @@ class SyncPyPI(SyncBase):
                 package_path.unlink(missing_ok=True)
             for i in release_files:
                 url = i["url"]
-                dest = (package_simple_path / self.pypi._file_url_to_local_url(i["url"])).resolve()
+                dest = (
+                    package_simple_path / self.pypi.file_url_to_local_url(i["url"])
+                ).resolve()
                 logger.info("downloading file %s -> %s", url, dest)
                 if dest.exists():
                     continue
                 dest.parent.mkdir(parents=True, exist_ok=True)
-                resp = self.session.get(url)
-                if resp.status_code >= 400:
-                    logger.warning("download %s failed, skipping this package", url)
-                    return None
-                with overwrite(dest, "wb") as f:
-                    f.write(resp.content)
+                download(self.session, url, dest)

         last_serial: int = meta["last_serial"]
         simple_html_contents = self.pypi.generate_html_simple_page(meta)
@@ -537,25 +551,38 @@ class SyncPlainHTTP(SyncBase):
         logger.info("updating %s", package_name)
         package_simple_path = self.simple_dir / package_name
         package_simple_path.mkdir(exist_ok=True)
+        if self.sync_packages:
+            existing_hrefs = get_existing_hrefs(package_simple_path)
         # directly fetch remote files
         for filename in ("index.html", "index.v1_html", "index.v1_json"):
             file_url = urljoin(self.upstream, f"/simple/{package_name}/{filename}")
-            resp = self.session.get(file_url)
-            if resp.status_code == 404:
+            success = download(self.session, file_url, package_simple_path / filename)
+            if not success:
                 if filename != "index.html":
                     logger.warning("%s does not exist", file_url)
                     continue
                 else:
                     logger.error("%s does not exist. Stop with this.", file_url)
                     return None
-            else:
-                resp.raise_for_status()
-            content = resp.content
-            with open(package_simple_path / filename, "wb") as f:
-                f.write(content)

         if self.sync_packages:
-            raise NotImplementedError
+            current_hrefs = get_existing_hrefs(package_simple_path)
+            should_remove = list(set(existing_hrefs) - set(current_hrefs))
+            for p in should_remove:
+                logger.info("removing file %s (if exists)", p)
+                package_path = (package_simple_path / p).resolve()
+                package_path.unlink(missing_ok=True)
+            package_simple_url = urljoin(self.upstream, f"/simple/{package_name}/")
+            for href in current_hrefs:
+                url = urljoin(package_simple_url, href)
+                dest = (
+                    package_simple_path / href
+                ).resolve()
+                logger.info("downloading file %s -> %s", url, dest)
+                if dest.exists():
+                    continue
+                dest.parent.mkdir(parents=True, exist_ok=True)
+                download(self.session, url, dest)

         last_serial = get_local_serial(package_simple_path)
         if not last_serial:
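
Note: the SyncPlainHTTP logic above is a snapshot-and-diff. Hrefs are
recorded before the index files are refreshed, re-read afterwards, and a
set difference drives deletion, while downloads are skipped for dests
that already exist. A tiny illustration with made-up hrefs:

    existing_hrefs = ["../../packages/a/b/old-1.0.tar.gz",
                      "../../packages/a/b/keep-1.0.tar.gz"]
    current_hrefs = ["../../packages/a/b/keep-1.0.tar.gz",
                     "../../packages/c/d/new-2.0.tar.gz"]
    should_remove = list(set(existing_hrefs) - set(current_hrefs))
    # -> ["../../packages/a/b/old-1.0.tar.gz"]; new-2.0.tar.gz is then
    # downloaded because its dest does not exist yet.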
@@ -592,7 +619,9 @@ def main(args: argparse.Namespace) -> None:
     local_db = LocalVersionKV(basedir / "local.db", basedir / "local.json")

     if args.command == "sync":
-        sync = SyncPyPI(basedir=basedir, local_db=local_db, sync_packages=args.sync_packages)
+        sync = SyncPyPI(
+            basedir=basedir, local_db=local_db, sync_packages=args.sync_packages
+        )
         local = local_db.dump()
         plan = sync.determine_sync_plan(local)
         # save plan for debugging
@@ -611,7 +640,9 @@ def main(args: argparse.Namespace) -> None:
         local_db.batch_set(local)
         local_db.dump_json()
     elif args.command == "verify":
-        sync = SyncPyPI(basedir=basedir, local_db=local_db)
+        sync = SyncPyPI(
+            basedir=basedir, local_db=local_db, sync_packages=args.sync_packages
+        )
         local_names = set(local_db.keys())
         simple_dirs = set([i.name for i in (basedir / "simple").iterdir()])
         for package_name in simple_dirs - local_names:
@@ -625,7 +656,11 @@ if __name__ == "__main__":
     subparsers = parser.add_subparsers(dest="command")

     parser_sync = subparsers.add_parser("sync", help="Sync from upstream")
-    parser_sync.add_argument("--sync-packages", help="Sync packages instead of just indexes", action='store_true')
+    parser_sync.add_argument(
+        "--sync-packages",
+        help="Sync packages instead of just indexes",
+        action="store_true",
+    )
     parser_genlocal = subparsers.add_parser(
         "genlocal", help="(Re)generate local db and json from simple/"
     )
@@ -633,6 +668,11 @@ if __name__ == "__main__":
         "verify",
         help="Verify existing sync from local db, download missing things, remove unreferenced packages",
     )
+    parser_verify.add_argument(
+        "--sync-packages",
+        help="Sync packages instead of just indexes",
+        action="store_true",
+    )

     args = parser.parse_args()
     if args.command is None:
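
Note: after this commit both subcommands accept --sync-packages. A quick
sanity check of the argparse wiring (the script's file name is not shown
in this diff, so the command lines in the comments are assumptions):

    # python <script>.py sync --sync-packages
    # python <script>.py verify --sync-packages
    args = parser.parse_args(["sync", "--sync-packages"])
    assert args.command == "sync" and args.sync_packages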