Update check_and_update() to check both json and html

taoky 2024-08-06 18:00:27 +08:00
parent c8a11d3d1f
commit b97754ce34
2 changed files with 50 additions and 23 deletions
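In short: the consistency pass previously trusted simple/<package>/index.html alone; after this commit it requires both index.html and the PEP 691 JSON index (index.v1_json) to exist and to list the same files, and re-queues the package for update otherwise. A self-contained sketch of that decision, with both extractors inlined and simplified from the hunks below (error handling and the real ATagHTMLParser details are omitted; names here are illustrative, not the actual code):

import json
from html.parser import HTMLParser
from pathlib import Path

def hrefs_from_html(html_path: Path) -> list[str]:
    # Collect <a href="..."> values; the real code also strips #fragments.
    class ATagParser(HTMLParser):
        def __init__(self) -> None:
            super().__init__()
            self.hrefs: list[str] = []

        def handle_starttag(self, tag, attrs) -> None:
            if tag == "a":
                for name, value in attrs:
                    if name == "href" and value:
                        self.hrefs.append(value.split("#")[0])

    parser = ATagParser()
    parser.feed(html_path.read_text())
    return parser.hrefs

def urls_from_json(json_path: Path) -> list[str]:
    # PEP 691 JSON index shape: {"files": [{"url": "..."}, ...]}
    return [f["url"] for f in json.loads(json_path.read_text())["files"]]

def needs_update(package_simple_path: Path) -> bool:
    html_file = package_simple_path / "index.html"
    json_file = package_simple_path / "index.v1_json"
    if not (html_file.exists() and json_file.exists()):
        return True  # one of the two indexes is missing entirely
    # Re-queue the package when the two indexes disagree.
    return hrefs_from_html(html_file) != urls_from_json(json_file)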

.gitignore

@@ -7,3 +7,4 @@ remote.json
 venv/
 packages/
 json/
+config.toml

shadowmire.py

@@ -156,7 +156,7 @@ def remove_dir_with_files(directory: Path) -> None:
     logger.info("Removed dir %s", directory)
 
 
-def get_packages_from_index_html(contents: str) -> list[str]:
+def get_packages_from_index_html(html_path: Path) -> list[str]:
     """
     Get all <a> href (fragments removed) from given simple/<package>/index.html contents
     """
@@ -175,7 +175,8 @@ def get_packages_from_index_html(contents: str) -> list[str]:
                     self.hrefs.append(attr[1])
 
     p = ATagHTMLParser()
-    p.feed(contents)
+    with open(html_path) as f:
+        p.feed(f.read())
 
     ret = []
     for href in p.hrefs:
@@ -186,19 +187,32 @@ def get_packages_from_index_html(contents: str) -> list[str]:
     return ret
 
 
+def get_packages_from_index_json(json_path: Path) -> list[str]:
+    """
+    Get all urls from given simple/<package>/index.v1_json contents
+    """
+    with open(json_path) as f:
+        contents_dict = json.load(f)
+    urls = [i["url"] for i in contents_dict["files"]]
+    return urls
+
+
 def get_existing_hrefs(package_simple_path: Path) -> Optional[list[str]]:
     """
     There exists packages that have no release files, so when it encounters errors it would return None,
     otherwise empty list or list with hrefs.
+
+    Priority: index.v1_json -> index.html
     """
-    existing_hrefs = []
-    try:
-        with open(package_simple_path / "index.html") as f:
-            contents = f.read()
-        existing_hrefs = get_packages_from_index_html(contents)
-    except FileNotFoundError:
+    if not package_simple_path.exists():
         return None
-    return existing_hrefs
+    json_file = package_simple_path / "index.v1_json"
+    html_file = package_simple_path / "index.html"
+    if json_file.exists():
+        return get_packages_from_index_json(json_file)
+    if html_file.exists():
+        return get_packages_from_index_html(html_file)
+    return None
 
 
 class CustomXMLRPCTransport(xmlrpc.client.Transport):
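For reference, the new get_packages_from_index_json() consumes a PEP 691-style JSON index. A runnable sketch against a made-up index.v1_json payload (the package name and file names below are hypothetical):

import json
from pathlib import Path

# Hypothetical PEP 691 payload, written to a temporary index.v1_json:
sample = {
    "meta": {"api-version": "1.0"},
    "name": "example-pkg",
    "files": [
        {"url": "example_pkg-1.0-py3-none-any.whl", "hashes": {}},
        {"url": "example-pkg-1.0.tar.gz", "hashes": {}},
    ],
}
json_path = Path("index.v1_json")
json_path.write_text(json.dumps(sample))

# The same extraction the new function performs:
with open(json_path) as f:
    contents_dict = json.load(f)
urls = [i["url"] for i in contents_dict["files"]]
print(urls)  # ['example_pkg-1.0-py3-none-any.whl', 'example-pkg-1.0.tar.gz']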
@@ -434,14 +448,27 @@ class SyncBase:
         for package_name in tqdm(package_names, desc="Checking consistency"):
             package_jsonmeta_path = self.jsonmeta_dir / package_name
             if not package_jsonmeta_path.exists():
+                logger.info("add %s as it does not have json API file", package_name)
                 to_update.append(package_name)
                 continue
             package_simple_path = self.simple_dir / package_name
-            hrefs = get_existing_hrefs(package_simple_path)
-            if hrefs is None:
-                # something unexpected happens...
+            html_simple = package_simple_path / "index.html"
+            json_simple = package_simple_path / "index.v1_json"
+            if not (html_simple.exists() and json_simple.exists()):
+                logger.info(
+                    "add %s as it does not have index.html or index.v1_json",
+                    package_name,
+                )
                 to_update.append(package_name)
                 continue
+            hrefs1 = get_packages_from_index_html(html_simple)
+            hrefs2 = get_packages_from_index_json(json_simple)
+            if hrefs1 is None or hrefs2 is None or hrefs1 != hrefs2:
+                # something unexpected happens...
+                logger.info("add %s as its indexes are not consistent", package_name)
+                to_update.append(package_name)
+                continue
+            hrefs = hrefs1
             # OK, check if all hrefs have corresponding files
             if self.sync_packages:
                 should_update = False
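One design note (my reading of the hunk above, not stated in the commit): hrefs1 != hrefs2 is an exact, order-sensitive list comparison, so the HTML and JSON indexes must list the same files in the same order to count as consistent:

# Order matters: these two listings name the same (hypothetical) files,
# yet they compare unequal and would trigger a re-sync.
hrefs1 = ["pkg-1.0.tar.gz", "pkg-1.0-py3-none-any.whl"]
hrefs2 = ["pkg-1.0-py3-none-any.whl", "pkg-1.0.tar.gz"]
assert hrefs1 != hrefs2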
@@ -451,6 +478,7 @@ class SyncBase:
                         should_update = True
                         break
                 if should_update:
+                    logger.info("add %s as it's missing packages", package_name)
                     to_update.append(package_name)
 
         self.parallel_update(to_update, [])
@@ -507,10 +535,8 @@ class SyncBase:
             # To make this less noisy...
             logger.info("removing %s", package_name)
             package_simple_dir = self.simple_dir / package_name
-            index_html = package_simple_dir / "index.html"
-            if index_html.exists():
-                with open(index_html) as f:
-                    packages_to_remove = get_packages_from_index_html(f.read())
+            packages_to_remove = get_existing_hrefs(package_simple_dir)
+            if packages_to_remove:
                 paths_to_remove = [package_simple_dir / p for p in packages_to_remove]
                 for p in paths_to_remove:
                     if p.exists():
@@ -986,6 +1012,8 @@ def do_update(
     basedir: Path = ctx.obj["basedir"]
     local_db: LocalVersionKV = ctx.obj["local_db"]
     excludes = exclude_to_excludes(exclude)
+    if excludes:
+        logger.warning("--exclude is ignored in do_update()")
     prerelease_excludes = exclude_to_excludes(prerelease_exclude)
     syncer = get_syncer(basedir, local_db, sync_packages, shadowmire_upstream)
     syncer.do_update(package_name, prerelease_excludes)
@@ -1007,8 +1035,6 @@ def do_remove(
     local_db = ctx.obj["local_db"]
     if exclude or prerelease_exclude:
         logger.warning("exclusion rules are ignored in do_remove()")
-    # excludes = exclude_to_excludes(exclude)
-    # prerelease_excludes = exclude_to_excludes(prerelease_exclude)
 
     syncer = get_syncer(basedir, local_db, sync_packages, shadowmire_upstream)
     syncer.do_remove(package_name)