Merge pull request #62 from dramforever/nix-new

nix-channels and nixos-images update due to upstream change
Yuxiang Zhang 2020-03-23 22:15:04 +08:00 committed by GitHub
commit fc9929258c
3 changed files with 60 additions and 182 deletions
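
In short: nixos.org/channels no longer serves the HTML directory listing these scripts used to scrape, so both scripts now read channel metadata straight from the public nix-channels S3 bucket via the newly added minio dependency. A minimal sketch of the new lookup path, reusing the bucket name and client calls from the diff below (assumes network access and a minio-py release of this era that still provides list_objects_v2):

import re
import minio

# Anonymous client; the nix-channels bucket is publicly readable.
client = minio.Minio('s3.amazonaws.com')

# Channel pointers are top-level objects such as 'nixos-19.09' or 'nixpkgs-unstable'
# (names here are illustrative); the filter mirrors the regex added in nix-channels.py.
channels = [
    (obj.object_name, obj.last_modified)
    for obj in client.list_objects_v2('nix-channels')
    if re.fullmatch(r'(nixos|nixpkgs)-.+[^/]', obj.object_name)
]

for name, updated in channels:
    print(name, updated)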

View File

@@ -1,7 +1,7 @@
 FROM python:3-buster
 MAINTAINER Wang Ruikang <dramforever@live.com>

-RUN pip3 install pyquery requests && \
+RUN pip3 install pyquery requests minio && \
     # Install Nix. To simplify management we only copy binaries and create
     # symlinks, and do no further configuration
     curl https://mirrors.tuna.tsinghua.edu.cn/nix/nix-2.3.2/nix-2.3.2-x86_64-linux.tar.xz -o /tmp/nix.tar.xz && \
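
The image change itself is only the extra minio package on the pip install line; the two rewritten scripts below are what import it. A quick, optional way to confirm a rebuilt image picked it up (run inside the container; not part of the sync scripts):

# Prints the installed minio-py version, or 'unknown' if the attribute is absent.
import minio
print(getattr(minio, '__version__', 'unknown'))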

View File

@@ -3,11 +3,13 @@ import hashlib
 import json
 import logging
 import lzma
+import minio
 import os
+import pytz
 import re
-import sys
 import requests
 import subprocess
+import sys

 from pyquery import PyQuery as pq
 from datetime import datetime, timedelta
@@ -35,7 +37,12 @@ RETAIN_DAYS = float(os.getenv('NIX_MIRROR_RETAIN_DAYS', 30))
 STORE_DIR = 'store'
 RELEASES_DIR = 'releases'

-CLONE_SINCE = datetime(2018, 12, 1)
+# Channels that have not updated since migration to Netlify [1] are assumed to
+# be too old and defunct.
+#
+# [1]: https://discourse.nixos.org/t/announcement-moving-nixos-org-to-netlify/6212
+CLONE_SINCE = datetime(2020, 3, 6, tzinfo=pytz.utc)
 TIMEOUT = 60

 working_dir = Path(WORKING_DIR)
@@ -63,9 +70,6 @@ logging.basicConfig(
 # Don't forget 'global failure'
 failure = False

-def http_head(*args, **kwargs):
-    return session.head(*args, timeout=TIMEOUT, **kwargs)
-
 def http_get(*args, **kwargs):
     return session.get(*args, timeout=TIMEOUT, **kwargs)
@@ -131,28 +135,14 @@ def download(url, dest):
     download_dest.rename(dest)

-def get_links(url):
-    r = http_get(url)
-    r.raise_for_status()
+client = minio.Minio('s3.amazonaws.com')

-    node = pq(r.content)
-    links = []
-    for row in node('tr'):
-        td = pq(row)('td')
-        if len(td) != 5:
-            continue
-        link_target = td[1].find('a').get('href')
-        if link_target.startswith('/'):
-            # Link to parent directory
-            continue
-        last_updated = td[2].text.strip()
-        links.append((link_target, last_updated))
-    return links
+def get_channels():
+    return [
+        (x.object_name, x.last_modified)
+        for x in client.list_objects_v2('nix-channels')
+        if re.fullmatch(r'(nixos|nixpkgs)-.+[^/]', x.object_name)
+    ]

 def clone_channels():
     logging.info(f'- Fetching channels')
@@ -161,17 +151,15 @@ def clone_channels():
     working_dir.mkdir(parents=True, exist_ok=True)

-    for channel, chan_updated in get_links(f'{UPSTREAM_URL}/'):
+    for channel, chan_updated in get_channels():
         chan_path = working_dir / channel

         # Old channels, little value in cloning and format changes
-        if datetime.strptime(chan_updated, '%Y-%m-%d %H:%M') < CLONE_SINCE:
+        if chan_updated < CLONE_SINCE:
             continue

-        chan_redirect_res = http_head(f'{UPSTREAM_URL}/{channel}', allow_redirects=False)
-        chan_redirect_res.raise_for_status()
-        chan_location = chan_redirect_res.headers['Location']
+        chan_obj = client.get_object('nix-channels', channel)
+        chan_location = chan_obj.headers['x-amz-website-redirect-location']

         chan_release = chan_location.split('/')[-1]
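
The block above swaps an HTTP HEAD against {UPSTREAM_URL}/{channel} for a direct read of the channel object's S3 website-redirect metadata. A standalone sketch of that resolution step, using the same bucket and header as the diff (the channel name is illustrative):

import minio

client = minio.Minio('s3.amazonaws.com')

def resolve_channel(channel):
    # The channel object is only a redirect stub; the release URL lives in its
    # website-redirect metadata, not in the object body.
    obj = client.get_object('nix-channels', channel)
    return obj.headers['x-amz-website-redirect-location']

location = resolve_channel('nixos-19.09')
print(location, '->', location.split('/')[-1])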

View File

@@ -2,6 +2,7 @@
 import hashlib
 import logging
 import lzma
+import minio
 import os
 import re
 import sys
@@ -11,6 +12,7 @@ import subprocess
 from pyquery import PyQuery as pq
 from datetime import datetime, timedelta
 from pathlib import Path
+from collections import defaultdict

 from urllib3.util.retry import Retry
@@ -38,14 +40,8 @@ def http_head(*args, **kwargs):
 def http_get(*args, **kwargs):
     return session.get(*args, timeout=TIMEOUT, **kwargs)

-def file_sha256(dest):
-    sha = subprocess.check_output(
-        [ 'sha256sum', str(dest) ],
-        universal_newlines=True
-    )
-    return sha.split(' ')[0]
-
 def atomic_write_file(dest, contents):
     dest.parent.mkdir(parents=True, exist_ok=True)
     tmp_dest = dest.parent / f'.{dest.name}.tmp'
     with tmp_dest.open('w') as f:
         f.write(contents)
@@ -93,156 +89,50 @@ def download(url, dest):
     download_dest.rename(dest)

-def get_links(url):
-    r = http_get(url)
-    r.raise_for_status()
+client = minio.Minio('s3.amazonaws.com')

-    node = pq(r.content)
-    links = []
-    for row in node('tr'):
-        td = pq(row)('td')
-        if len(td) != 5:
-            continue
-        link_target = td[1].find('a').get('href')
-        if link_target.startswith('/'):
-            # Link to parent directory
-            continue
-        last_updated = td[2].text.strip()
-        links.append((link_target, last_updated))
-    return links
-
-def get_channel(chan_location):
-    release_res = http_get(chan_location)
-    release_res.raise_for_status()
-    node = pq(release_res.text)
-    tagline = node('p').text()
-    tagline_res = re.match(r'^Released on (.+) from', tagline)
-    assert tagline_res is not None
-    released_time = tagline_res[1]
-    files = []
-    for row in node('tr'):
-        td = pq(row)('td')
-        if len(td) != 3:
-            continue
-        file_name, file_size, file_hash = (pq(x).text() for x in td)
-        files.append((file_name, file_size, file_hash))
-    return {
-        'released_time': released_time,
-        'files': files
-    }
+def get_url(name):
+    response = client.get_object('nix-channels', name)
+    return response.headers['x-amz-website-redirect-location']

 def clone_images():
-    for channel, chan_updated in get_links(f'{UPSTREAM_URL}/'):
-        if not channel.startswith('nixos-') \
-            or channel.endswith('-small') \
-            or channel == 'nixos-unstable':
-            continue
-
-        if datetime.strptime(chan_updated, '%Y-%m-%d %H:%M') < CLONE_SINCE:
-            continue
-
-        chan_path = working_dir / channel
-        chan_path.mkdir(parents=True, exist_ok=True)
-
-        res = http_head(f'{UPSTREAM_URL}/{channel}', allow_redirects=False)
-        res.raise_for_status()
-        chan_location = res.headers['Location']
-        chan_release_basename = chan_location.split('/')[-1]
-
-        try:
-            last_url = (chan_path / '.last-url').read_text()
-        except (IOError, OSError):
-            last_url = 'not available'
-
-        if chan_location == last_url:
-            continue
-
-        logging.info(f'- {channel} -> {chan_release_basename}')
-
-        # Matches nixos-19.03 -> nixos-19.03beta171840.23fd1394dc6
-        #                        ^-------------^
-        if chan_release_basename.startswith(channel + 'beta'):
-            logging.info(f'  - Beta channel, not updating')
-            continue
-
-        chan_info = get_channel(chan_location)
-        atomic_write_file(chan_path / '.released-time', chan_info['released_time'])
-
-        has_hash_fail = False
-        keep_files = { '.last-url', '.released-time' }
-        rename_files = []
-
-        logging.info(f'  - Downloading new files')
-
-        chan_version = channel.split('-', 1)[1]
-        chan_release_version = chan_release_basename.split('-', 1)[1]
-        simplify_name = lambda fname: fname.replace(f'-{chan_release_version}-', f'-{chan_version}-')
-
-        image_files = [
-            (simplify_name(file_name), file_name, file_hash)
-            for file_name, _file_size, file_hash in chan_info['files']
-            if file_name.endswith('.iso') or file_name.endswith('ova')
-        ]
-
-        for mirror_file_name, upstream_file_name, file_hash in image_files:
-            keep_files.add(mirror_file_name)
-            logging.info(f'    - {upstream_file_name} -> {mirror_file_name}')
-
-            tmp_dest = f'.update.{upstream_file_name}'
-            rename_files.append((tmp_dest, mirror_file_name))
-
-            download(f'{chan_location}/{upstream_file_name}', chan_path / tmp_dest)
-            actual_hash = file_sha256(chan_path / tmp_dest)
-
-            if file_hash != actual_hash:
-                has_hash_fail = True
-                logging.error(f'      - Incorrect hash')
-                logging.error(f'        actual   {actual_hash}')
-                logging.error(f'        expected {file_hash}')
-                logging.info(f'      - File saved as {tmp_dest}')
-
-        if has_hash_fail:
-            logging.warn(f'  - Found bad files. Will retry next time.')
-        else:
-            logging.info(f'  - Renaming files')
-            for tmp_dest, mirror_file_name in rename_files:
-                (chan_path / tmp_dest).rename(chan_path / mirror_file_name)
-
-            logging.info(f'  - Removing useless files')
-            for file_path in chan_path.iterdir():
-                file_name = file_path.name
-                if file_name not in keep_files:
-                    logging.info(f'    - {file_name}')
-                    file_path.unlink()
-
-            logging.info(f'  - Writing SHA256SUMS')
-            with (chan_path / 'SHA256SUMS').open('w') as f:
-                for mirror_file_name, _upstream_file_name, file_hash in image_files:
-                    f.write(f'{file_hash} *{mirror_file_name}\n')
-
-            logging.info(f'  - Update finished')
-            atomic_write_file(chan_path / '.last-url', chan_location)
+    DOWNLOAD_MATCH = r'nixos-\d\d.\d\d/latest-nixos-\w+-\w+-linux.\w+(.sha256)?'
+
+    object_names = [
+        x.object_name
+        for x in client.list_objects_v2('nix-channels', recursive=True)
+        if re.fullmatch(DOWNLOAD_MATCH, x.object_name)
+    ]
+
+    channels = defaultdict(lambda: [])
+
+    for name in object_names:
+        chan, file = name.split('/', 1)
+        channels[chan].append(file)
+
+    for channel, files in channels.items():
+        chan_dir = working_dir / channel
+
+        git_rev = http_get(get_url(f'{channel}/git-revision')).text
+        git_rev_path = chan_dir / 'git-revision'
+
+        if git_rev_path.exists() and git_rev == git_rev_path.read_text():
+            continue
+
+        logging.info(f'- {channel} -> {git_rev}')
+
+        for file in files:
+            logging.info(f'  - {file}')
+            url = get_url(f'{channel}/{file}')
+            try:
+                download(url, chan_dir / file)
+            except requests.HTTPError as e:
+                if e.response.status_code == 404:
+                    logging.info(f'    - 404, skipped')
+                else:
+                    raise
+
+        atomic_write_file(git_rev_path, git_rev)

 if __name__ == "__main__":
     clone_images()
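
For reference, the rewritten clone_images() is driven entirely by DOWNLOAD_MATCH and the per-channel git-revision object: matching image and .sha256 objects are listed recursively, grouped by channel through the defaultdict, and a channel is skipped when its stored git-revision has not changed. A small offline check of what the regex selects (object names here are made-up examples, not a listing of the bucket):

import re

DOWNLOAD_MATCH = r'nixos-\d\d.\d\d/latest-nixos-\w+-\w+-linux.\w+(.sha256)?'

samples = [
    'nixos-19.09/latest-nixos-minimal-x86_64-linux.iso',         # selected
    'nixos-19.09/latest-nixos-minimal-x86_64-linux.iso.sha256',  # selected (optional .sha256 suffix)
    'nixos-19.09-small/latest-nixos-minimal-x86_64-linux.iso',   # not selected: '-small' breaks 'nixos-NN.NN/'
    'nixos-unstable/latest-nixos-minimal-x86_64-linux.iso',      # not selected: 'unstable' is not NN.NN
]

for name in samples:
    print(bool(re.fullmatch(DOWNLOAD_MATCH, name)), name)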