mirror of
https://github.com/tuna/tunasync-scripts.git
synced 2025-04-20 04:12:42 +00:00
143 lines
4.1 KiB
Python
Executable File
143 lines
4.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import os
|
|
import subprocess as sp
|
|
from pathlib import Path
|
|
from email.utils import parsedate_to_datetime
|
|
|
|
import requests
|
|
from pyquery import PyQuery as pq
|
|
|
|
|
|
BASE_URL = os.getenv("TUNASYNC_UPSTREAM_URL", "https://download.docker.com/")
|
|
WORKING_DIR = os.getenv("TUNASYNC_WORKING_DIR")
|
|
|
|
|
|
class RemoteSite:
|
|
|
|
def __init__(self, base_url=BASE_URL):
|
|
if not base_url.endswith('/'):
|
|
base_url = base_url + '/'
|
|
self.base_url = base_url
|
|
self.meta_urls = []
|
|
|
|
def is_metafile_url(self, url):
|
|
deb_dists=('debian', 'ubuntu', 'raspbian')
|
|
rpm_dists=('fedora', 'centos')
|
|
|
|
for dist in deb_dists:
|
|
if '/'+dist+'/' not in url:
|
|
continue
|
|
if '/Contents-' in url:
|
|
return True
|
|
if '/binary-' in url:
|
|
return True
|
|
if 'Release' in url:
|
|
return True
|
|
|
|
for dist in rpm_dists:
|
|
if '/'+dist+'/' not in url:
|
|
continue
|
|
if '/repodata/' in url:
|
|
return True
|
|
|
|
return False
|
|
|
|
def recursive_get_filelist(self, base_url, filter_meta=False):
|
|
if not base_url.endswith('/'):
|
|
yield base_url
|
|
return
|
|
|
|
r = requests.get(base_url)
|
|
if not r.ok:
|
|
return
|
|
|
|
d = pq(r.text)
|
|
for link in d('a'):
|
|
if link.text.startswith('..'):
|
|
continue
|
|
href = base_url + link.text
|
|
if filter_meta and self.is_metafile_url(href):
|
|
self.meta_urls.append(href)
|
|
elif link.text.endswith('/'):
|
|
yield from self.recursive_get_filelist(href, filter_meta=filter_meta)
|
|
else:
|
|
yield href
|
|
|
|
def relpath(self, url):
|
|
assert url.startswith(self.base_url)
|
|
return url[len(self.base_url):]
|
|
|
|
@property
|
|
def files(self):
|
|
yield from self.recursive_get_filelist(self.base_url, filter_meta=True)
|
|
for url in self.meta_urls:
|
|
yield from self.recursive_get_filelist(url, filter_meta=False)
|
|
|
|
|
|
def curl_download(remote_url: str, dst_file: Path):
|
|
sp.check_call([
|
|
"curl", "-o", str(dst_file),
|
|
"-sL", "--remote-time", "--show-error",
|
|
"--fail", remote_url,
|
|
])
|
|
|
|
|
|
def main():
|
|
import argparse
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument("--base-url", default=BASE_URL)
|
|
parser.add_argument("--working-dir", default=WORKING_DIR)
|
|
args = parser.parse_args()
|
|
|
|
if args.working_dir is None:
|
|
raise Exception("Working Directory is None")
|
|
|
|
working_dir = Path(args.working_dir)
|
|
|
|
remote_filelist = []
|
|
rs = RemoteSite(args.base_url)
|
|
for url in rs.files:
|
|
dst_file = working_dir / rs.relpath(url)
|
|
remote_filelist.append(dst_file.relative_to(working_dir))
|
|
|
|
if dst_file.is_file():
|
|
r = requests.head(url)
|
|
remote_filesize = int(r.headers['content-length'])
|
|
remote_date = parsedate_to_datetime(r.headers['last-modified'])
|
|
stat = dst_file.stat()
|
|
local_filesize = stat.st_size
|
|
local_mtime = stat.st_mtime
|
|
|
|
if remote_filesize == local_filesize and remote_date.timestamp() == local_mtime:
|
|
print("Skipping", dst_file.relative_to(working_dir), flush=True)
|
|
continue
|
|
|
|
dst_file.unlink()
|
|
else:
|
|
dst_file.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
print("downloading", url, flush=True)
|
|
try:
|
|
curl_download(url, dst_file)
|
|
except Exception:
|
|
print("Failed to download", url, flush=True)
|
|
if dst_file.is_file():
|
|
dst_file.unlink()
|
|
|
|
local_filelist = []
|
|
for local_file in working_dir.glob('**/*'):
|
|
if local_file.is_file():
|
|
local_filelist.append(local_file.relative_to(working_dir))
|
|
|
|
for old_file in set(local_filelist) - set(remote_filelist):
|
|
print("deleting", old_file, flush=True)
|
|
old_file = working_dir / old_file
|
|
old_file.unlink()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|
|
|
|
|
|
# vim: ts=4 sw=4 sts=4 expandtab
|