#!/usr/bin/env python3
"""Mirror Anaconda installers and conda package repositories.

Synchronises, under TUNASYNC_WORKING_DIR:

  * installer archives   <repo base>/archive, <repo base>/miniconda
  * conda package repos  <repo base>/pkgs/<repo>/<arch>
  * Anaconda Cloud repos <cloud base>/<channel>/<arch>

Package files are downloaded with curl and verified against the MD5
recorded in each repository's repodata.json.  (Replaces the former
anaconda.sh + helpers/anaconda-filelist.py shell implementation.)
"""
import os
import json
import hashlib
import tempfile
import shutil
import logging
import subprocess as sp
from pathlib import Path
from email.utils import parsedate_to_datetime

DEFAULT_CONDA_REPO_BASE = "https://repo.continuum.io"
DEFAULT_CONDA_CLOUD_BASE = "https://conda.anaconda.org"

CONDA_REPO_BASE_URL = os.getenv("CONDA_REPO_URL", DEFAULT_CONDA_REPO_BASE)
# "CONDA_COULD_URL" (sic) was the originally shipped, misspelled name; it is
# kept as a fallback so existing deployments keep working.
CONDA_CLOUD_BASE_URL = os.getenv(
    "CONDA_CLOUD_URL",
    os.getenv("CONDA_COULD_URL", DEFAULT_CONDA_CLOUD_BASE))

WORKING_DIR = os.getenv("TUNASYNC_WORKING_DIR")

CONDA_REPOS = ("main", "free", "r", "mro", "pro")
CONDA_ARCHES = (
    "noarch", "linux-64", "linux-32", "linux-armv6l", "linux-armv7l",
    "linux-ppc64le", "osx-64", "osx-32", "win-64", "win-32"
)

CONDA_CLOUD_REPOS = (
    "conda-forge/linux-64", "conda-forge/osx-64", "conda-forge/win-64", "conda-forge/noarch",
    "msys2/win-64", "msys2/noarch",
    "bioconda/noarch", "bioconda/linux-64", "bioconda/osx-64",
    "menpo/linux-64", "menpo/osx-64", "menpo/win-64", "menpo/win-32", "menpo/noarch",
)

logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] [%(levelname)s] %(message)s",
)


def md5_check(file: Path, md5: str = None) -> bool:
    """Return True iff *file*'s MD5 hex digest equals *md5*.

    The file is read in 1 MiB chunks so arbitrarily large packages can be
    verified without loading them into memory.  A *md5* of None never
    matches (the digest is a str).
    """
    m = hashlib.md5()
    with file.open("rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            m.update(chunk)
    return m.hexdigest() == md5


def curl_download(remote_url: str, dst_file: Path, md5: str = None):
    """Download *remote_url* to *dst_file* with curl.

    Returns None on success or the string "MD5 mismatch" when *md5* is
    given and verification fails.  Raises sp.CalledProcessError when curl
    itself exits non-zero (--fail makes HTTP errors non-zero too).
    --remote-time preserves the server mtime, which sync_installer relies
    on for its up-to-date check.
    """
    sp.check_call([
        "curl", "-o", str(dst_file),
        "-sL", "--remote-time", "--show-error",
        "--fail", remote_url,
    ])
    if md5 and not md5_check(dst_file, md5):
        return "MD5 mismatch"
    return None


def _download_with_retry(pkg_url: str, dst_file: Path,
                         md5: str = None, tries: int = 3) -> bool:
    """Download *pkg_url* with up to *tries* attempts; True on success.

    Unlike a bare curl_download call, a curl failure (network error, HTTP
    error) is caught and retried instead of aborting the whole repo sync.
    On final failure any partial/corrupt file is removed so a later run
    does not mistake it for a complete package.
    """
    for _ in range(tries):
        logging.info("Downloading {}".format(dst_file.name))
        try:
            err = curl_download(pkg_url, dst_file, md5=md5)
        except sp.CalledProcessError as e:
            err = "curl exited with status {}".format(e.returncode)
        if err is None:
            return True
        logging.error("Failed to download {}: {}".format(dst_file.name, err))
    if dst_file.is_file():
        dst_file.unlink()
    return False


def sync_repo(repo_url: str, local_dir: Path, tmpdir: Path):
    """Mirror one conda repository (repodata.json + all listed packages).

    repodata.json / repodata.json.bz2 are downloaded into *tmpdir* first
    and moved into *local_dir* only after every package has been
    processed, so an interrupted sync never publishes an index that
    references files we do not have yet.
    """
    logging.info("Start syncing {}".format(repo_url))
    local_dir.mkdir(parents=True, exist_ok=True)

    tmp_repodata = tmpdir / "repodata.json"
    tmp_bz2_repodata = tmpdir / "repodata.json.bz2"

    curl_download(repo_url + "/repodata.json", tmp_repodata)
    curl_download(repo_url + "/repodata.json.bz2", tmp_bz2_repodata)

    with tmp_repodata.open() as f:
        repodata = json.load(f)

    for filename, meta in repodata["packages"].items():
        pkg_url = "/".join([repo_url, filename])
        dst_file = local_dir / filename

        if dst_file.is_file():
            # A matching size is treated as "already mirrored"; MD5 is
            # only checked on fresh downloads.
            if meta["size"] == dst_file.stat().st_size:
                logging.info("Skipping {}".format(filename))
                continue
            dst_file.unlink()

        _download_with_retry(pkg_url, dst_file, md5=meta["md5"])

    shutil.move(str(tmp_repodata), str(local_dir / "repodata.json"))
    shutil.move(str(tmp_bz2_repodata), str(local_dir / "repodata.json.bz2"))


def sync_installer(repo_url, local_dir: Path):
    """Mirror installer tarballs listed in the HTML index at *repo_url*."""
    # Third-party packages are only needed for the HTML index scraping;
    # import lazily so the rest of the module works without them.
    import requests
    from pyquery import PyQuery as pq

    logging.info("Start syncing {}".format(repo_url))
    local_dir.mkdir(parents=True, exist_ok=True)

    def remote_list():
        # The index is an HTML table: filename | size | date | md5.
        r = requests.get(repo_url)
        d = pq(r.content)
        for tr in d("table").find("tr"):
            tds = pq(tr).find("td")
            if len(tds) != 4:
                continue
            yield tds[0].find("a").text, tds[3].text

    for filename, md5 in remote_list():
        pkg_url = "/".join([repo_url, filename])
        dst_file = local_dir / filename

        if dst_file.is_file():
            # Up-to-date check: size AND server mtime must match the
            # local copy (curl --remote-time preserved the mtime).
            r = requests.head(pkg_url)
            remote_filesize = int(r.headers["content-length"])
            remote_date = parsedate_to_datetime(r.headers["last-modified"])
            stat = dst_file.stat()
            if (remote_filesize == stat.st_size
                    and remote_date.timestamp() == stat.st_mtime):
                logging.info("Skipping {}".format(filename))
                continue
            dst_file.unlink()

        _download_with_retry(pkg_url, dst_file, md5=md5)


def main():
    """Sync installers, official repos and cloud repos in sequence.

    Each target is isolated in its own try/except so one failing
    repository does not abort the remaining ones.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--working-dir", default=WORKING_DIR)
    args = parser.parse_args()

    if args.working_dir is None:
        raise Exception("Working Directory is None")

    working_dir = Path(args.working_dir)

    for dist in ("archive", "miniconda"):
        remote_url = "{}/{}".format(CONDA_REPO_BASE_URL, dist)
        try:
            sync_installer(remote_url, working_dir / dist)
        except Exception:
            logging.exception("Failed to sync installers of {}".format(dist))

    for repo in CONDA_REPOS:
        for arch in CONDA_ARCHES:
            remote_url = "{}/pkgs/{}/{}".format(CONDA_REPO_BASE_URL, repo, arch)
            local_dir = working_dir / "pkgs" / repo / arch
            # TemporaryDirectory cleans up even when sync_repo raises.
            with tempfile.TemporaryDirectory() as tmpdir:
                try:
                    sync_repo(remote_url, local_dir, Path(tmpdir))
                except Exception:
                    logging.exception(
                        "Failed to sync repo: {}/{}".format(repo, arch))

    for repo in CONDA_CLOUD_REPOS:
        remote_url = "{}/{}".format(CONDA_CLOUD_BASE_URL, repo)
        local_dir = working_dir / "cloud" / repo
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                sync_repo(remote_url, local_dir, Path(tmpdir))
            except Exception:
                # Fix: the original logged a stale `arch` left over from
                # the pkgs loop above; cloud repo names already embed it.
                logging.exception("Failed to sync repo: {}".format(repo))


if __name__ == "__main__":
    main()

# vim: ts=4 sw=4 sts=4 expandtab