diff --git a/nixos-images.py b/nixos-images.py
new file mode 100755
index 0000000..9403876
--- /dev/null
+++ b/nixos-images.py
@@ -0,0 +1,247 @@
+#!/usr/bin/env python3
+import logging
+import os
+import re
+import subprocess
+
+import requests
+
+from pyquery import PyQuery as pq
+from datetime import datetime
+from pathlib import Path
+
+from urllib3.exceptions import MaxRetryError
+from urllib3.util.retry import Retry
+
+UPSTREAM_URL = os.getenv('TUNASYNC_UPSTREAM_URL', 'https://nixos.org/channels')
+WORKING_DIR = os.getenv('TUNASYNC_WORKING_DIR', 'working-images')
+CLONE_SINCE = datetime(2018, 12, 1)
+TIMEOUT = 60
+
+working_dir = Path(WORKING_DIR)
+
+session = requests.Session()
+retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
+retry_adapter = requests.adapters.HTTPAdapter(max_retries=retries)
+session.mount('http://', retry_adapter)
+session.mount('https://', retry_adapter)
+
+logging.basicConfig(
+    level=logging.INFO,
+    format='[%(asctime)s] %(levelname)-8s %(message)s'
+)
+
+def http_head(*args, **kwargs):
+    return session.head(*args, timeout=TIMEOUT, **kwargs)
+
+def http_get(*args, **kwargs):
+    return session.get(*args, timeout=TIMEOUT, **kwargs)
+
+def file_sha256(dest):
+    # The images are multi-gigabyte, so shell out to sha256sum rather than
+    # hashing in Python.
+    sha = subprocess.check_output(
+        ['sha256sum', str(dest)],
+        universal_newlines=True
+    )
+    return sha.split(' ')[0]
+
+def atomic_write_file(dest, contents):
+    tmp_dest = dest.parent / f'.{dest.name}.tmp'
+    with tmp_dest.open('w') as f:
+        f.write(contents)
+    tmp_dest.rename(dest)
+
+class WrongSize(RuntimeError):
+    def __init__(self, expected, actual):
+        super().__init__(f'Wrong file size: expected {expected}, actual {actual}')
+        self.actual = actual
+        self.expected = expected
+
+def download(url, dest):
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    download_dest = dest.parent / f'.{dest.name}.tmp'
+
+    retry = retries
+
+    while True:
+        with http_get(url, stream=True) as res:
+            res.raise_for_status()
+            try:
+                with download_dest.open('wb') as f:
+                    for chunk in res.iter_content(chunk_size=64 * 1024 * 1024):
+                        if chunk:
+                            f.write(chunk)
+                actual_size = download_dest.stat().st_size
+                if 'Content-Length' in res.headers:
+                    expected_size = int(res.headers['Content-Length'])
+                    if actual_size != expected_size:
+                        raise WrongSize(expected=expected_size, actual=actual_size)
+
+                break
+            except (requests.exceptions.ConnectionError, WrongSize) as e:
+                logging.warning(e)
+                # Retry.increment raises MaxRetryError once the budget is
+                # exhausted; surface the original error instead.
+                try:
+                    retry = retry.increment(method='GET', url=url, error=e)
+                except MaxRetryError:
+                    raise e
+                logging.warning(f'Retrying download: {retry}')
+
+    download_dest.rename(dest)
+
+def get_links(url):
+    r = http_get(url)
+    r.raise_for_status()
+
+    node = pq(r.content)
+
+    links = []
+    for row in node('tr'):
+        td = pq(row)('td')
+        if len(td) != 5:
+            continue
+
+        link_target = td[1].find('a').get('href')
+        if link_target.startswith('/'):
+            # Link to parent directory
+            continue
+
+        last_updated = td[2].text.strip()
+
+        links.append((link_target, last_updated))
+
+    return links
+
+def get_channel(chan_location):
+    release_res = http_get(chan_location)
+    release_res.raise_for_status()
+
+    node = pq(release_res.text)
+
+    tagline = node('p').text()
+
+    tagline_res = re.match(r'^Released on (.+) from', tagline)
+
+    assert tagline_res is not None
+
+    released_time = tagline_res[1]
+
+    files = []
+
+    for row in node('tr'):
+        td = pq(row)('td')
+        if len(td) != 3:
+            continue
+        file_name, file_size, file_hash = (pq(x).text() for x in td)
+        files.append((file_name, file_size, file_hash))
+
+    return {
+        'released_time': released_time,
+        'files': files
+    }
+
+def clone_images():
+    for channel, chan_updated in get_links(f'{UPSTREAM_URL}/'):
+        if (not channel.startswith('nixos-')
+                or channel.endswith('-small')
+                or channel == 'nixos-unstable'):
+            continue
+
+        if datetime.strptime(chan_updated, '%Y-%m-%d %H:%M') < CLONE_SINCE:
+            continue
+
+        chan_path = working_dir / channel
+        chan_path.mkdir(parents=True, exist_ok=True)
+
+        res = http_head(f'{UPSTREAM_URL}/{channel}', allow_redirects=False)
+        res.raise_for_status()
+
+        chan_location = res.headers['Location']
+        chan_release_basename = chan_location.split('/')[-1]
+
+        try:
+            last_url = (chan_path / '.last-url').read_text()
+        except OSError:
+            last_url = 'not available'
+
+        if chan_location == last_url:
+            continue
+
+        logging.info(f'- {channel} -> {chan_release_basename}')
+
+        # Matches nixos-19.03 -> nixos-19.03beta171840.23fd1394dc6
+        #                        ^-------------^
+        if chan_release_basename.startswith(channel + 'beta'):
+            logging.info('  - Beta channel, not updating')
+            continue
+
+        chan_info = get_channel(chan_location)
+
+        atomic_write_file(chan_path / '.released-time', chan_info['released_time'])
+
+        has_hash_fail = False
+
+        keep_files = {'.last-url', '.released-time'}
+        rename_files = []
+
+        logging.info('  - Downloading new files')
+
+        chan_version = channel.split('-', 1)[1]
+        chan_release_version = chan_release_basename.split('-', 1)[1]
+
+        def simplify_name(fname):
+            # Replace the full release version with the channel version.
+            return fname.replace(f'-{chan_release_version}-', f'-{chan_version}-')
+
+        image_files = [
+            (simplify_name(file_name), file_name, file_hash)
+            for file_name, _file_size, file_hash in chan_info['files']
+            if file_name.endswith('.iso') or file_name.endswith('.ova')
+        ]
+
+        for mirror_file_name, upstream_file_name, file_hash in image_files:
+            keep_files.add(mirror_file_name)
+            logging.info(f'    - {upstream_file_name} -> {mirror_file_name}')
+            tmp_dest = f'.update.{upstream_file_name}'
+            rename_files.append((tmp_dest, mirror_file_name))
+
+            download(f'{chan_location}/{upstream_file_name}', chan_path / tmp_dest)
+            actual_hash = file_sha256(chan_path / tmp_dest)
+
+            if file_hash != actual_hash:
+                has_hash_fail = True
+                logging.error('      - Incorrect hash')
+                logging.error(f'        actual   {actual_hash}')
+                logging.error(f'        expected {file_hash}')
+                logging.info(f'      - File saved as {tmp_dest}')
+
+        if has_hash_fail:
+            logging.warning('  - Found bad files. Will retry next time.')
+        else:
+            logging.info('  - Renaming files')
+
+            for tmp_dest, mirror_file_name in rename_files:
+                (chan_path / tmp_dest).rename(chan_path / mirror_file_name)
+
+            logging.info('  - Removing useless files')
+
+            for file_path in chan_path.iterdir():
+                file_name = file_path.name
+
+                if file_name not in keep_files:
+                    logging.info(f'    - {file_name}')
+                    file_path.unlink()
+
+            logging.info('  - Writing SHA256SUMS')
+
+            with (chan_path / 'SHA256SUMS').open('w') as f:
+                for mirror_file_name, _upstream_file_name, file_hash in image_files:
+                    f.write(f'{file_hash} *{mirror_file_name}\n')
+
+            logging.info('  - Update finished')
+            atomic_write_file(chan_path / '.last-url', chan_location)
+
+if __name__ == "__main__":
+    clone_images()
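
A minimal driver sketch for trying the script locally, outside of tunasync (not part of the diff above): it runs nixos-images.py as a child process with the two TUNASYNC_* environment variables the script reads at import time. The upstream URL and working directory below are illustrative assumptions, not values mandated by any deployment.

    # test-run.py -- hypothetical helper, not included in this change.
    # Exports placeholder TUNASYNC_* variables and executes the mirror script.
    import os
    import subprocess

    env = dict(
        os.environ,
        TUNASYNC_UPSTREAM_URL='https://nixos.org/channels',
        TUNASYNC_WORKING_DIR='/tmp/nixos-images-test',  # scratch dir, placeholder
    )
    subprocess.run(['./nixos-images.py'], env=env, check=True)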