From 4afe8ac16daa1c8227c5947ea294018c14566903 Mon Sep 17 00:00:00 2001
From: dramforever
Date: Mon, 23 Mar 2020 22:04:53 +0800
Subject: [PATCH] nixos-images: update for nixos.org's move to Netlify

nixos.org has moved to Netlify (announcement below). Stop scraping the
HTML release listings and list the images from the nix-channels S3
bucket directly: each channel object there is an S3 website redirect
whose x-amz-website-redirect-location header points at the real release
file, and the mirrored git-revision file decides whether a channel
needs re-syncing.

https://discourse.nixos.org/t/announcement-moving-nixos-org-to-netlify/6212
---
 nixos-images.py | 188 ++++++++++--------------------------------------
 1 file changed, 39 insertions(+), 149 deletions(-)

diff --git a/nixos-images.py b/nixos-images.py
index 9403876..809973b 100755
--- a/nixos-images.py
+++ b/nixos-images.py
@@ -2,6 +2,7 @@
 import hashlib
 import logging
 import lzma
+import minio
 import os
 import re
 import sys
@@ -11,6 +12,7 @@ import subprocess
 from pyquery import PyQuery as pq
 from datetime import datetime, timedelta
 from pathlib import Path
+from collections import defaultdict
 
 from urllib3.util.retry import Retry
 
@@ -38,14 +40,8 @@ def http_head(*args, **kwargs):
 def http_get(*args, **kwargs):
     return session.get(*args, timeout=TIMEOUT, **kwargs)
 
-def file_sha256(dest):
-    sha = subprocess.check_output(
-        [ 'sha256sum', str(dest) ],
-        universal_newlines=True
-    )
-    return sha.split(' ')[0]
-
 def atomic_write_file(dest, contents):
+    dest.parent.mkdir(parents=True, exist_ok=True)
     tmp_dest = dest.parent / f'.{dest.name}.tmp'
     with tmp_dest.open('w') as f:
         f.write(contents)
@@ -93,156 +89,50 @@ def download(url, dest):
 
     download_dest.rename(dest)
 
-def get_links(url):
-    r = http_get(url)
-    r.raise_for_status()
+client = minio.Minio('s3.amazonaws.com')
 
-    node = pq(r.content)
-
-    links = []
-    for row in node('tr'):
-        td = pq(row)('td')
-        if len(td) != 5:
-            continue
-
-        link_target = td[1].find('a').get('href')
-        if link_target.startswith('/'):
-            # Link to parent directory
-            continue
-
-        last_updated = td[2].text.strip()
-
-        links.append((link_target, last_updated))
-
-    return links
-
-def get_channel(chan_location):
-    release_res = http_get(chan_location)
-    release_res.raise_for_status()
-
-    node = pq(release_res.text)
-
-    tagline = node('p').text()
-
-    tagline_res = re.match(r'^Released on (.+) from', tagline)
-
-    assert tagline_res is not None
-
-    released_time = tagline_res[1]
-
-    files = []
-
-    for row in node('tr'):
-        td = pq(row)('td')
-        if len(td) != 3:
-            continue
-        file_name, file_size, file_hash = (pq(x).text() for x in td)
-        files.append((file_name, file_size, file_hash))
-
-    return {
-        'released_time': released_time,
-        'files': files
-    }
+def get_url(name):
+    response = client.get_object('nix-channels', name)
+    return response.headers['x-amz-website-redirect-location']
 
 def clone_images():
-    for channel, chan_updated in get_links(f'{UPSTREAM_URL}/'):
-        if not channel.startswith('nixos-') \
-                or channel.endswith('-small') \
-                or channel == 'nixos-unstable':
+    DOWNLOAD_MATCH = r'nixos-\d\d\.\d\d/latest-nixos-\w+-\w+-linux\.\w+(\.sha256)?'
+
+    object_names = [
+        x.object_name
+        for x in client.list_objects_v2('nix-channels', recursive=True)
+        if re.fullmatch(DOWNLOAD_MATCH, x.object_name)
+    ]
+
+    channels = defaultdict(list)
+
+    for name in object_names:
+        chan, file = name.split('/', 1)
+        channels[chan].append(file)
+
+    for channel, files in channels.items():
+        chan_dir = working_dir / channel
+        git_rev = http_get(get_url(f'{channel}/git-revision')).text
+        git_rev_path = chan_dir / 'git-revision'
+
+        if git_rev_path.exists() and git_rev == git_rev_path.read_text():
             continue
 
-        if datetime.strptime(chan_updated, '%Y-%m-%d %H:%M') < CLONE_SINCE:
-            continue
+        logging.info(f'- {channel} -> {git_rev}')
 
-        chan_path = working_dir / channel
-        chan_path.mkdir(parents=True, exist_ok=True)
+        for file in files:
+            logging.info(f'  - {file}')
+            url = get_url(f'{channel}/{file}')
 
-        res = http_head(f'{UPSTREAM_URL}/{channel}', allow_redirects=False)
-        res.raise_for_status()
+            try:
+                download(url, chan_dir / file)
+            except requests.HTTPError as e:
+                if e.response.status_code == 404:
+                    logging.info('    - 404, skipped')
+                else:
+                    raise
 
-        chan_location = res.headers['Location']
-        chan_release_basename = chan_location.split('/')[-1]
-
-        try:
-            last_url = (chan_path / '.last-url').read_text()
-        except (IOError, OSError):
-            last_url = 'not available'
-
-        if chan_location == last_url:
-            continue
-
-        logging.info(f'- {channel} -> {chan_release_basename}')
-
-        # Matches nixos-19.03 -> nixos-19.03beta171840.23fd1394dc6
-        #         ^-------------^
-        if chan_release_basename.startswith(channel + 'beta'):
-            logging.info(f'  - Beta channel, not updating')
-            continue
-
-        chan_info = get_channel(chan_location)
-
-        atomic_write_file(chan_path / '.released-time', chan_info['released_time'])
-
-        has_hash_fail = False
-
-        keep_files = { '.last-url', '.released-time' }
-        rename_files = []
-
-        logging.info(f'  - Downloading new files')
-
-        chan_version = channel.split('-', 1)[1]
-
-        chan_release_version = chan_release_basename.split('-', 1)[1]
-
-        simplify_name = lambda fname: fname.replace(f'-{chan_release_version}-', f'-{chan_version}-')
-
-        image_files = [
-            (simplify_name(file_name), file_name, file_hash)
-            for file_name, _file_size, file_hash in chan_info['files']
-            if file_name.endswith('.iso') or file_name.endswith('ova')
-        ]
-
-        for mirror_file_name, upstream_file_name, file_hash in image_files:
-            keep_files.add(mirror_file_name)
-            logging.info(f'  - {upstream_file_name} -> {mirror_file_name}')
-            tmp_dest = f'.update.{upstream_file_name}'
-            rename_files.append((tmp_dest, mirror_file_name))
-
-            download(f'{chan_location}/{upstream_file_name}', chan_path / tmp_dest)
-            actual_hash = file_sha256(chan_path / tmp_dest)
-
-            if file_hash != actual_hash:
-                has_hash_fail = True
-                logging.error(f'    - Incorrect hash')
-                logging.error(f'      actual   {actual_hash}')
-                logging.error(f'      expected {file_hash}')
-                logging.info(f'    - File saved as {tmp_dest}')
-
-        if has_hash_fail:
-            logging.warn(f'  - Found bad files. Will retry next time.')
-        else:
-            logging.info(f'  - Renaming files')
-
-            for tmp_dest, mirror_file_name in rename_files:
-                (chan_path / tmp_dest).rename(chan_path / mirror_file_name)
-
-            logging.info(f'  - Removing useless files')
-
-            for file_path in chan_path.iterdir():
-                file_name = file_path.name
-
-                if file_name not in keep_files:
-                    logging.info(f'    - {file_name}')
-                    file_path.unlink()
-
-            logging.info(f'  - Writing SHA256SUMS')
-
-            with (chan_path / 'SHA256SUMS').open('w') as f:
-                for mirror_file_name, _upstream_file_name, file_hash in image_files:
-                    f.write(f'{file_hash} *{mirror_file_name}\n')
-
-        logging.info(f'  - Update finished')
-        atomic_write_file(chan_path / '.last-url', chan_location)
+        atomic_write_file(git_rev_path, git_rev)
 
 if __name__ == "__main__":
     clone_images()
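
Note on the filtering: DOWNLOAD_MATCH now carries the channel selection
that the old code spelled out with string checks (skip channels not
starting with nixos-, the -small variants, and nixos-unstable). Only
stable nixos-YY.MM channels can match the pattern. A quick standalone
check of that behaviour; the object names are made-up examples in the
bucket's layout, shown only for illustration:

    import re

    DOWNLOAD_MATCH = r'nixos-\d\d\.\d\d/latest-nixos-\w+-\w+-linux\.\w+(\.sha256)?'

    # Stable-channel images and their .sha256 companions match ...
    assert re.fullmatch(DOWNLOAD_MATCH, 'nixos-19.09/latest-nixos-minimal-x86_64-linux.iso')
    assert re.fullmatch(DOWNLOAD_MATCH, 'nixos-19.09/latest-nixos-minimal-x86_64-linux.iso.sha256')

    # ... while unstable and -small channels fall out automatically.
    assert not re.fullmatch(DOWNLOAD_MATCH, 'nixos-unstable/latest-nixos-minimal-x86_64-linux.iso')
    assert not re.fullmatch(DOWNLOAD_MATCH, 'nixos-19.09-small/latest-nixos-minimal-x86_64-linux.iso')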
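
The S3 lookup itself can be exercised on its own. A minimal sketch,
assuming minio-py 5.x (where list_objects_v2 exists and get_object
returns a urllib3 response) and anonymous access to the public
nix-channels bucket; the channel and file names are illustrative:

    import minio

    # Unauthenticated client; nix-channels is a public bucket.
    client = minio.Minio('s3.amazonaws.com')

    # Channel objects are S3 website redirects: the download URL travels
    # in the x-amz-website-redirect-location response header, which is
    # what get_url() in the patch reads.
    response = client.get_object('nix-channels', 'nixos-19.09/git-revision')
    print(response.headers['x-amz-website-redirect-location'])

    # Listing recursively yields flat object names such as
    # 'nixos-19.09/latest-nixos-minimal-x86_64-linux.iso', the form that
    # DOWNLOAD_MATCH is applied against.
    for obj in client.list_objects_v2('nix-channels', prefix='nixos-19.09/', recursive=True):
        print(obj.object_name)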