Add a check on the number of releases to avoid unnecessary API requests

莫振宇 2024-01-22 16:52:46 +08:00
parent 7b1d7701c6
commit e647cf614a


@@ -21,20 +21,21 @@ REPOS = []
 TIMEOUT_OPTION = (7, 10)
 total_size = 0
 
-def sizeof_fmt(num, suffix='iB'):
-    for unit in ['','K','M','G','T','P','E','Z']:
+def sizeof_fmt(num, suffix="iB"):
+    for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
         if abs(num) < 1024.0:
             return "%3.2f%s%s" % (num, unit, suffix)
         num /= 1024.0
-    return "%.2f%s%s" % (num, 'Y', suffix)
+    return "%.2f%s%s" % (num, "Y", suffix)
 
 # wrap around requests.get to use token if available
 def github_get(*args, **kwargs):
-    headers = kwargs['headers'] if 'headers' in kwargs else {}
-    if 'GITHUB_TOKEN' in os.environ:
-        headers['Authorization'] = 'token {}'.format(
-            os.environ['GITHUB_TOKEN'])
-    kwargs['headers'] = headers
+    headers = kwargs["headers"] if "headers" in kwargs else {}
+    if "GITHUB_TOKEN" in os.environ:
+        headers["Authorization"] = "token {}".format(os.environ["GITHUB_TOKEN"])
+    kwargs["headers"] = headers
     return requests.get(*args, **kwargs)
@@ -44,7 +45,12 @@ def do_download(remote_url: str, dst_file: Path, remote_ts: float, remote_size:
         r.raise_for_status()
         tmp_dst_file = None
         try:
-            with tempfile.NamedTemporaryFile(prefix="." + dst_file.name + ".", suffix=".tmp", dir=dst_file.parent, delete=False) as f:
+            with tempfile.NamedTemporaryFile(
+                prefix="." + dst_file.name + ".",
+                suffix=".tmp",
+                dir=dst_file.parent,
+                delete=False,
+            ) as f:
                 tmp_dst_file = Path(f.name)
                 for chunk in r.iter_content(chunk_size=1024**2):
                     if chunk:  # filter out keep-alive new chunks
@@ -53,7 +59,9 @@ def do_download(remote_url: str, dst_file: Path, remote_ts: float, remote_size:
             # check for downloaded size
             downloaded_size = tmp_dst_file.stat().st_size
             if remote_size != -1 and downloaded_size != remote_size:
-                raise Exception(f'File {dst_file.as_posix()} size mismatch: downloaded {downloaded_size} bytes, expected {remote_size} bytes')
+                raise Exception(
+                    f"File {dst_file.as_posix()} size mismatch: downloaded {downloaded_size} bytes, expected {remote_size} bytes"
+                )
             os.utime(tmp_dst_file, (remote_ts, remote_ts))
             tmp_dst_file.chmod(0o644)
             tmp_dst_file.replace(dst_file)
@@ -71,8 +79,7 @@ def downloading_worker(q):
         url, dst_file, working_dir, updated, remote_size = item
-        print("downloading", url, "to",
-              dst_file.relative_to(working_dir), flush=True)
+        print("downloading", url, "to", dst_file.relative_to(working_dir), flush=True)
         try:
             do_download(url, dst_file, updated, remote_size)
         except Exception:
@@ -86,30 +93,35 @@ def downloading_worker(q):
 def create_workers(n):
     task_queue = queue.Queue()
     for i in range(n):
-        t = threading.Thread(target=downloading_worker, args=(task_queue, ))
+        t = threading.Thread(target=downloading_worker, args=(task_queue,))
         t.start()
     return task_queue
 
 def ensure_safe_name(filename):
-    filename = filename.replace('\0', ' ')
-    if filename == '.':
-        return ' .'
-    elif filename == '..':
-        return '. .'
+    filename = filename.replace("\0", " ")
+    if filename == ".":
+        return " ."
+    elif filename == "..":
+        return ". ."
     else:
-        return filename.replace('/', '\\').replace('\\', '_')
+        return filename.replace("/", "\\").replace("\\", "_")
 
 def main():
     import argparse
     parser = argparse.ArgumentParser()
     parser.add_argument("--base-url", default=BASE_URL)
     parser.add_argument("--working-dir", default=WORKING_DIR)
-    parser.add_argument("--workers", default=1, type=int,
-                        help='number of concurrent downloading jobs')
-    parser.add_argument("--fast-skip", action='store_true',
-                        help='do not verify size and timestamp of existing files')
+    parser.add_argument(
+        "--workers", default=1, type=int, help="number of concurrent downloading jobs"
+    )
+    parser.add_argument(
+        "--fast-skip",
+        action="store_true",
+        help="do not verify size and timestamp of existing files",
+    )
     parser.add_argument("--config", default=CONFIG)
     args = parser.parse_args()
@@ -124,14 +136,15 @@ def main():
     with open(args.config, "r") as f:
         REPOS = json.load(f)
 
-    def download(release, release_dir, tarball = False):
+    def download(release, release_dir, tarball=False):
         global total_size
         if tarball:
-            url = release['tarball_url']
-            updated = datetime.strptime(
-                release['published_at'], '%Y-%m-%dT%H:%M:%SZ').timestamp()
-            dst_file = release_dir / 'repo-snapshot.tar.gz'
+            url = release["tarball_url"]
+            updated = datetime.strptime(
+                release["published_at"], "%Y-%m-%dT%H:%M:%SZ"
+            ).timestamp()
+            dst_file = release_dir / "repo-snapshot.tar.gz"
             remote_filelist.append(dst_file.relative_to(working_dir))
             if dst_file.is_file():
@@ -141,19 +154,21 @@ def main():
             # tarball has no size information, use -1 to skip size check
             task_queue.put((url, dst_file, working_dir, updated, -1))
 
-        for asset in release['assets']:
-            url = asset['browser_download_url']
-            updated = datetime.strptime(
-                asset['updated_at'], '%Y-%m-%dT%H:%M:%SZ').timestamp()
-            dst_file = release_dir / ensure_safe_name(asset['name'])
+        for asset in release["assets"]:
+            url = asset["browser_download_url"]
+            updated = datetime.strptime(
+                asset["updated_at"], "%Y-%m-%dT%H:%M:%SZ"
+            ).timestamp()
+            dst_file = release_dir / ensure_safe_name(asset["name"])
             remote_filelist.append(dst_file.relative_to(working_dir))
-            remote_size = asset['size']
+            remote_size = asset["size"]
             total_size += remote_size
 
             if dst_file.is_file():
                 if args.fast_skip:
-                    print("fast skipping", dst_file.relative_to(
-                        working_dir), flush=True)
+                    print(
+                        "fast skipping", dst_file.relative_to(working_dir), flush=True
+                    )
                     continue
                 else:
                     stat = dst_file.stat()
@@ -161,10 +176,12 @@ def main():
                     local_mtime = stat.st_mtime
                     # print(f"{local_filesize} vs {asset['size']}")
                     # print(f"{local_mtime} vs {updated}")
-                    if local_mtime > updated or \
-                            remote_size == local_filesize and local_mtime == updated:
-                        print("skipping", dst_file.relative_to(
-                            working_dir), flush=True)
+                    if (
+                        local_mtime > updated
+                        or remote_size == local_filesize
+                        and local_mtime == updated
+                    ):
+                        print("skipping", dst_file.relative_to(working_dir), flush=True)
                         continue
             else:
                 dst_file.parent.mkdir(parents=True, exist_ok=True)
@@ -210,10 +227,14 @@ def main():
                 r = github_get(url_str, headers=headers)
                 r.raise_for_status()
                 releases.extend(r.json())
-                next_url = [ s.split(";")[0].strip()[1:-1] for s in r.headers["link"].split(",") if s.split(";")[1].strip() == 'rel="next"' ]
+                next_url = [
+                    s.split(";")[0].strip()[1:-1]
+                    for s in r.headers["link"].split(",")
+                    if s.split(";")[1].strip() == 'rel="next"'
+                ]
                 if next_url:
                     url_str = next_url[0]
-                else:
+                elif versions > 0 and len(releases) > versions:
                     url_str = None
         except:
             traceback.print_exc()
@@ -221,8 +242,8 @@ def main():
         n_downloaded = 0
         for release in releases:
-            if not release['draft'] and (prerelease or not release['prerelease']):
-                name = ensure_safe_name(release['name'] or release['tag_name'])
+            if not release["draft"] and (prerelease or not release["prerelease"]):
+                name = ensure_safe_name(release["name"] or release["tag_name"])
                 if len(name) == 0:
                     print("Error: Unnamed release")
                     continue
@@ -247,7 +268,7 @@ def main():
     if cleaning:
         local_filelist = []
-        for local_file in working_dir.glob('**/*'):
+        for local_file in working_dir.glob("**/*"):
             if local_file.is_file():
                 local_filelist.append(local_file.relative_to(working_dir))
@@ -256,7 +277,7 @@ def main():
             old_file = working_dir / old_file
             old_file.unlink()
 
-        for local_dir in working_dir.glob('*/*/*'):
+        for local_dir in working_dir.glob("*/*/*"):
             if local_dir.is_dir():
                 try:
                     # remove empty dirs only
@@ -266,6 +287,7 @@ def main():
     print("Total size is", sizeof_fmt(total_size, suffix=""))
 
+
 if __name__ == "__main__":
     main()
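
For context, the functional change sits in the pagination hunk (@@ -210,10 +227,14 @@): the new "versions" condition is meant to stop walking GitHub's Link rel="next" pages once enough releases have been collected, which is what the commit message means by avoiding unnecessary API requests. The sketch below is only a minimal, hypothetical illustration of that idea and not the mirror script itself: the function name fetch_releases, the per_page parameter, and the bare requests.get call (without the script's GITHUB_TOKEN handling) are assumptions, and it breaks out of the loop directly instead of mirroring the script's exact control flow.

import requests


def fetch_releases(repo, versions=0, per_page=30):
    """Hypothetical sketch: stop paginating once enough releases are collected.

    `versions <= 0` is taken to mean "fetch every page"; a positive value
    stops the pagination loop early, saving the extra API requests the
    commit message refers to.
    """
    releases = []
    url = f"https://api.github.com/repos/{repo}/releases?per_page={per_page}"
    while url is not None:
        r = requests.get(url, timeout=(7, 10))
        r.raise_for_status()
        releases.extend(r.json())

        # Stop early: we already have at least as many releases as requested.
        if versions > 0 and len(releases) >= versions:
            break

        # Otherwise follow the rel="next" link from the Link header, if any.
        next_url = [
            part.split(";")[0].strip()[1:-1]
            for part in r.headers.get("link", "").split(",")
            if ";" in part and part.split(";")[1].strip() == 'rel="next"'
        ]
        url = next_url[0] if next_url else None
    return releases[:versions] if versions > 0 else releases


# Example usage (hypothetical): fetch at most 5 releases of a repository.
# releases = fetch_releases("tuna/tunasync", versions=5)

Stopping the loop this way saves one request per page that is not needed; whether the real script also trims the collected list afterwards is not shown in the diff, so the final slice here is purely illustrative.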