mirror of https://github.com/tuna/tunasync-scripts.git (synced 2025-04-20 12:42:50 +00:00)
Add github-raw script
which downloads raw content from GitHub (note: not GitHub releases)
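Typical invocation, as a sketch (tunasync normally supplies the environment variables; the working-dir path here is hypothetical): TUNASYNC_WORKING_DIR=/data/github-raw ./github-raw.py --workers 4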
parent a9b8ec33cd
commit 7f47e5cbc0
github-raw.py (executable file, 169 lines added)
@@ -0,0 +1,169 @@
#!/usr/bin/env python3
import os
import sys
import threading
import traceback
import queue
from pathlib import Path
from datetime import datetime
import tempfile
import hashlib

import requests


BASE_URL = os.getenv("TUNASYNC_UPSTREAM_URL", "https://api.github.com/repos/")
WORKING_DIR = os.getenv("TUNASYNC_WORKING_DIR")
REPOS = [
    # owner/repo, then tree path components, ending with the blob name
    ["fpco/minghc", "master", "bin", "7z.exe"],
    ["fpco/minghc", "master", "bin", "7z.dll"],
]

# connect and read timeout values (seconds)
TIMEOUT_OPTION = (7, 10)
total_size = 0

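# For illustration (hypothetical path): with working dir /data, the first
# REPOS entry is stored at /data/fpco/minghc/master/bin/7z.exe, because each
# queued task's components are simply joined with "/".
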
# wrap around requests.get to use token if available
def github_get(*args, **kwargs):
    headers = kwargs.get('headers', {})
    if 'GITHUB_TOKEN' in os.environ:
        headers['Authorization'] = 'token {}'.format(
            os.environ['GITHUB_TOKEN'])
    kwargs['headers'] = headers
    # apply the module-level timeout unless the caller passed one
    # (TIMEOUT_OPTION was previously defined but never used)
    kwargs.setdefault('timeout', TIMEOUT_OPTION)
    return requests.get(*args, **kwargs)

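# Note: without a token, api.github.com allows only 60 requests per hour per
# client IP; setting GITHUB_TOKEN raises that to 5,000 per hour.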

def github_tree(*args, **kwargs):
    headers = kwargs.get('headers', {})
    headers["Accept"] = "application/vnd.github.v3+json"
    kwargs['headers'] = headers
    return github_get(*args, **kwargs)


# NOTE the blob API supports files up to 100MB
# To get larger ones, we would need raw.githubusercontent.com, which is not implemented yet
def github_blob(*args, **kwargs):
    headers = kwargs.get('headers', {})
    headers["Accept"] = "application/vnd.github.v3.raw"
    kwargs['headers'] = headers
    return github_get(*args, **kwargs)

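# Hypothetical sketch of that missing large-file path (not part of this
# commit): raw.githubusercontent.com serves file contents directly by ref and
# path, without the 100MB blob-API limit. Nothing below calls this helper.
def raw_githubusercontent_get(owner_repo, ref, path, **kwargs):
    url = f"https://raw.githubusercontent.com/{owner_repo}/{ref}/{path}"
    kwargs.setdefault('stream', True)
    return github_get(url, **kwargs)
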
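# What follows stores each downloaded blob under <dir>/.sha/<git sha> and
# points the visible filename at it via a relative symlink; unchanged blobs
# are later skipped by comparing the symlink target's name with the sha
# reported by the tree listing.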
def do_download(remote_url: str, dst_file: Path, remote_size: int, sha: str):
    # NOTE the stream=True parameter below
    with github_blob(remote_url, stream=True) as r:
        r.raise_for_status()
        tmp_dst_file = None
        try:
            with tempfile.NamedTemporaryFile(prefix="." + dst_file.name + ".", suffix=".tmp", dir=dst_file.parent, delete=False) as f:
                tmp_dst_file = Path(f.name)
                for chunk in r.iter_content(chunk_size=1024**2):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
            # check the downloaded size
            downloaded_size = tmp_dst_file.stat().st_size
            if remote_size != -1 and downloaded_size != remote_size:
                raise Exception(f'File {dst_file.as_posix()} size mismatch: downloaded {downloaded_size} bytes, expected {remote_size} bytes')
            tmp_dst_file.chmod(0o644)
            target = dst_file.parent / ".sha" / sha
            print("symlink", dst_file)
            print("target", target)
            target.parent.mkdir(parents=True, exist_ok=True)
            tmp_dst_file.replace(target)
            if dst_file.is_symlink():
                # drop the stale symlink and the old blob it pointed to
                origin = dst_file.parent / os.readlink(dst_file)
                print("origin", origin)
                dst_file.unlink()
                origin.unlink()
            dst_file.symlink_to(Path(".sha") / sha)
        finally:
            if tmp_dst_file is not None:
                if tmp_dst_file.is_file():
                    tmp_dst_file.unlink()


def downloading_worker(q):
    while True:
        item = q.get()
        if item is None:
            break

        dst_file = Path('/'.join(item))
        dst_file.parent.mkdir(parents=True, exist_ok=True)

        item.pop(0)  # remove working dir
        owner_repo = item.pop(0)
        try:
            tree = item.pop(0)
            tree_child = item.pop(0)
            child_is_leaf = False
            url = ''
            sha = ''
            size = 0
            # walk the git tree one level at a time until the blob is reached
            while not child_is_leaf:
                with github_tree(f"{BASE_URL}{owner_repo}/git/trees/{tree}") as r:
                    r.raise_for_status()
                    tree_json = r.json()
                    for child in tree_json["tree"]:
                        if tree_child == child["path"]:
                            if child["type"] == "tree":
                                tree = child["sha"]
                                tree_child = item.pop(0)
                            elif child["type"] == "blob":
                                child_is_leaf = True
                                url = child["url"]
                                size = child["size"]
                                sha = child["sha"]
                            else:
                                raise Exception(f"unsupported object type {child['type']}")
                            break
                    else:
                        # for/else: tree_child was not found in this tree
                        raise Exception(f"{tree_child} not found in tree {tree}")
            if not dst_file.is_symlink() or \
                    dst_file.stat().st_size != size or \
                    Path(os.readlink(dst_file)).name != sha:
                do_download(url, dst_file, size, sha)
            else:
                print("Skip", dst_file)
        except Exception as e:
            print(e)
            print("Failed to download", dst_file, flush=True)
            if dst_file.is_file():
                dst_file.unlink()

        q.task_done()

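# For reference, the loop above consumes a trees API response of this shape
# (field values are illustrative, unrelated fields trimmed):
#   GET https://api.github.com/repos/fpco/minghc/git/trees/master
#   {"sha": "...",
#    "tree": [{"path": "bin",    "type": "tree", "sha": "...", "url": "..."},
#             {"path": "7z.exe", "type": "blob", "sha": "...", "size": 1234,
#              "url": ".../git/blobs/<sha>"}],
#    "truncated": false}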

def create_workers(n):
    task_queue = queue.Queue()
    for i in range(n):
        t = threading.Thread(target=downloading_worker, args=(task_queue, ))
        t.start()
    return task_queue

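# The workers are plain (non-daemon) threads; they exit only when main() puts
# one None sentinel per worker on the queue after join() completes.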

def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--working-dir", default=WORKING_DIR)
    parser.add_argument("--workers", default=1, type=int,
                        help='number of concurrent downloading jobs')
    args = parser.parse_args()

    if args.working_dir is None:
        raise Exception("Working Directory is None")

    working_dir = args.working_dir
    task_queue = create_workers(args.workers)

    for cfg in REPOS:
        cfg.insert(0, working_dir)
        task_queue.put(cfg)

    # block until all tasks are done
    task_queue.join()
    # stop workers
    for i in range(args.workers):
        task_queue.put(None)


if __name__ == "__main__":
    main()


# vim: ts=4 sw=4 sts=4 expandtab