Merge branch 'dev'

This commit is contained in:
bigeagle 2017-12-04 18:04:47 +08:00
commit b36754688e
3 changed files with 196 additions and 222 deletions

anaconda.py Normal file

@@ -0,0 +1,196 @@
#!/usr/bin/env python3
import os
import json
import hashlib
import tempfile
import shutil
import logging
import subprocess as sp
from pathlib import Path
from email.utils import parsedate_to_datetime
import requests
from pyquery import PyQuery as pq

DEFAULT_CONDA_REPO_BASE = "https://repo.continuum.io"
DEFAULT_CONDA_CLOUD_BASE = "https://conda.anaconda.org"
CONDA_REPO_BASE_URL = os.getenv("CONDA_REPO_URL", DEFAULT_CONDA_REPO_BASE)
CONDA_CLOUD_BASE_URL = os.getenv("CONDA_CLOUD_URL", DEFAULT_CONDA_CLOUD_BASE)
WORKING_DIR = os.getenv("TUNASYNC_WORKING_DIR")

CONDA_REPOS = ("main", "free", "r", "mro", "pro")
CONDA_ARCHES = (
    "noarch", "linux-64", "linux-32", "linux-armv6l", "linux-armv7l",
    "linux-ppc64le", "osx-64", "osx-32", "win-64", "win-32",
)
CONDA_CLOUD_REPOS = (
    "conda-forge/linux-64", "conda-forge/osx-64", "conda-forge/win-64", "conda-forge/noarch",
    "msys2/win-64", "msys2/noarch",
    "bioconda/noarch", "bioconda/linux-64", "bioconda/osx-64",
    "menpo/linux-64", "menpo/osx-64", "menpo/win-64", "menpo/win-32", "menpo/noarch",
)

logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] [%(levelname)s] %(message)s",
)
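

# Verify a file's MD5 digest against the expected hex string, reading in
# 1 MiB chunks so memory stays bounded even for large packages.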
def md5_check(file: Path, md5: str = None) -> bool:
    m = hashlib.md5()
    with file.open('rb') as f:
        while True:
            buf = f.read(1 * 1024 * 1024)
            if not buf:
                break
            m.update(buf)
    return m.hexdigest() == md5
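

# Download one URL with curl: -sL follows redirects quietly, --remote-time
# preserves the upstream mtime (sync_installer compares it later), and
# --fail turns HTTP errors into a non-zero exit so check_call raises.
# Returns an error string on MD5 mismatch, None on success.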
def curl_download(remote_url: str, dst_file: Path, md5: str = None):
    sp.check_call([
        "curl", "-o", str(dst_file),
        "-sL", "--remote-time", "--show-error",
        "--fail", remote_url,
    ])
    if md5 and (not md5_check(dst_file, md5)):
        return "MD5 mismatch"
def sync_repo(repo_url: str, local_dir: Path, tmpdir: Path):
    logging.info("Start syncing {}".format(repo_url))
    local_dir.mkdir(parents=True, exist_ok=True)

    repodata_url = repo_url + '/repodata.json'
    bz2_repodata_url = repo_url + '/repodata.json.bz2'
    tmp_repodata = tmpdir / "repodata.json"
    tmp_bz2_repodata = tmpdir / "repodata.json.bz2"

    curl_download(repodata_url, tmp_repodata)
    curl_download(bz2_repodata_url, tmp_bz2_repodata)

    with tmp_repodata.open() as f:
        repodata = json.load(f)

    packages = repodata['packages']
    for filename, meta in packages.items():
        file_size, md5 = meta['size'], meta['md5']
        pkg_url = '/'.join([repo_url, filename])
        dst_file = local_dir / filename

        if dst_file.is_file():
            stat = dst_file.stat()
            local_filesize = stat.st_size
            if file_size == local_filesize:
                logging.info("Skipping {}".format(filename))
                continue
            dst_file.unlink()

        for retry in range(3):
            logging.info("Downloading {}".format(filename))
            err = curl_download(pkg_url, dst_file, md5=md5)
            if err is None:
                break
            logging.error("Failed to download {}: {}".format(filename, err))

    shutil.move(str(tmp_repodata), str(local_dir / "repodata.json"))
    shutil.move(str(tmp_bz2_repodata), str(local_dir / "repodata.json.bz2"))
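

# Mirror the installer directories by scraping the HTML index table
# (filename in column 1, MD5 in column 4); a local file is kept only if
# both its size and mtime match the HEAD response for the remote file.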
def sync_installer(repo_url, local_dir: Path):
    logging.info("Start syncing {}".format(repo_url))
    local_dir.mkdir(parents=True, exist_ok=True)

    def remote_list():
        r = requests.get(repo_url)
        d = pq(r.content)
        for tr in d('table').find('tr'):
            tds = pq(tr).find('td')
            if len(tds) != 4:
                continue
            fname = tds[0].find('a').text
            md5 = tds[3].text
            yield (fname, md5)

    for filename, md5 in remote_list():
        pkg_url = "/".join([repo_url, filename])
        dst_file = local_dir / filename

        if dst_file.is_file():
            r = requests.head(pkg_url)
            remote_filesize = int(r.headers['content-length'])
            remote_date = parsedate_to_datetime(r.headers['last-modified'])
            stat = dst_file.stat()
            local_filesize = stat.st_size
            local_mtime = stat.st_mtime
            if remote_filesize == local_filesize and remote_date.timestamp() == local_mtime:
                logging.info("Skipping {}".format(filename))
                continue
            dst_file.unlink()

        for retry in range(3):
            logging.info("Downloading {}".format(filename))
            err = curl_download(pkg_url, dst_file, md5=md5)
            if err is None:
                break
            logging.error("Failed to download {}: {}".format(filename, err))
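

# Sync installers first, then the official pkgs/ channels, then the selected
# Anaconda Cloud channels; failures are logged per target without aborting
# the rest of the run.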
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--working-dir", default=WORKING_DIR)
    args = parser.parse_args()

    if args.working_dir is None:
        raise Exception("Working directory is not set; pass --working-dir or set TUNASYNC_WORKING_DIR")

    working_dir = Path(args.working_dir)

    for dist in ("archive", "miniconda"):
        remote_url = "{}/{}".format(CONDA_REPO_BASE_URL, dist)
        local_dir = working_dir / dist
        try:
            sync_installer(remote_url, local_dir)
        except Exception:
            logging.exception("Failed to sync installers of {}".format(dist))

    for repo in CONDA_REPOS:
        for arch in CONDA_ARCHES:
            remote_url = "{}/pkgs/{}/{}".format(CONDA_REPO_BASE_URL, repo, arch)
            local_dir = working_dir / "pkgs" / repo / arch
            tmpdir = tempfile.mkdtemp()
            try:
                sync_repo(remote_url, local_dir, Path(tmpdir))
            except Exception:
                logging.exception("Failed to sync repo: {}/{}".format(repo, arch))
            finally:
                shutil.rmtree(tmpdir)

    for repo in CONDA_CLOUD_REPOS:
        remote_url = "{}/{}".format(CONDA_CLOUD_BASE_URL, repo)
        local_dir = working_dir / "cloud" / repo
        tmpdir = tempfile.mkdtemp()
        try:
            sync_repo(remote_url, local_dir, Path(tmpdir))
        except Exception:
            logging.exception("Failed to sync repo: {}".format(repo))
        finally:
            shutil.rmtree(tmpdir)


if __name__ == "__main__":
    main()
# vim: ts=4 sw=4 sts=4 expandtab

anaconda.sh

@@ -1,185 +0,0 @@
#!/bin/bash
# requires: wget, lftp, jq, python3.5, lxml, pyquery
set -e
set -u
set -o pipefail
_here=$(dirname "$(realpath "$0")")
HTMLPARSE="${_here}/helpers/anaconda-filelist.py"
DEFAULT_CONDA_REPO_BASE="https://repo.continuum.io"
DEFAULT_CONDA_CLOUD_BASE="https://conda.anaconda.org"
CONDA_REPO_BASE="${CONDA_REPO_BASE:-$DEFAULT_CONDA_REPO_BASE}"
CONDA_CLOUD_BASE="${CONDA_CLOUD_BASE:-$DEFAULT_CONDA_CLOUD_BASE}"
LOCAL_DIR_BASE="${TUNASYNC_WORKING_DIR}"
TMP_DIR=$(mktemp -d)
CONDA_REPOS=("main" "free" "r" "mro" "pro")
CONDA_ARCHES=("noarch" "linux-64" "linux-32" "linux-armv6l" "linux-armv7l" "linux-ppc64le" "osx-64" "osx-32" "win-64" "win-32")
CONDA_CLOUD_REPOS=(
    "conda-forge/linux-64" "conda-forge/osx-64" "conda-forge/win-64" "conda-forge/noarch"
    "msys2/win-64" "msys2/noarch"
    "bioconda/noarch" "bioconda/linux-64" "bioconda/osx-64"
    "menpo/linux-64" "menpo/osx-64" "menpo/win-64" "menpo/win-32" "menpo/noarch"
)
EXIT_STATUS=0
EXIT_MSG=""
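
# probe the remote file with a spider request first, then fetch it into local_file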
function check-and-download () {
    remote_file=$1
    local_file=$2
    # test the exit status directly: under `set -e` a bare failing command
    # would abort the whole script before `$?` could be inspected
    if wget -q --spider ${remote_file}; then
        echo "downloading ${remote_file}"
        wget -q -O ${local_file} ${remote_file}
        return
    fi
    return 1
}
function cleanup () {
    echo "cleaning up"
    [[ -d ${TMP_DIR} ]] && {
        [[ -f ${TMP_DIR}/repodata.json ]] && rm ${TMP_DIR}/repodata.json
        [[ -f ${TMP_DIR}/repodata.json.bz2 ]] && rm ${TMP_DIR}/repodata.json.bz2
        [[ -f ${TMP_DIR}/failed ]] && rm ${TMP_DIR}/failed
        rmdir ${TMP_DIR}
    }
}
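
# download a package and verify its MD5, giving up after four failed attempts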
function download-with-checksum () {
    local pkg_url=$1
    local dest_file=$2
    local pkgmd5=$3

    local downloaded=false
    local trials=0

    while [[ $downloaded != true ]]; do
        echo "downloading ${pkg_url}"
        wget -q -O ${dest_file} ${pkg_url} && {
            # md5sum -c expects two spaces between checksum and filename
            { md5sum -c - < <(echo "${pkgmd5}  ${dest_file}"); } && downloaded=true || trials=$((trials + 1))
        } || {
            trials=$((trials + 1))
        }
        if (( $trials > 3 )); then
            return 1
        fi
    done
    return 0
}
trap cleanup EXIT
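
# scrape the installer directory listing; the helper script emits
# "filename<TAB>mtime<TAB>md5" records, one per file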
function sync_installer () {
    repo_url="$1"
    repo_dir="$2"

    [[ ! -d "$repo_dir" ]] && mkdir -p "$repo_dir"
    cd $repo_dir

    while read -a tokens; do
        fname=${tokens[0]}
        pkgmd5=${tokens[2]}
        dest_file="${repo_dir}${fname}"
        pkg_url="${repo_url}${fname}"
        pkgsize=$(curl --head -s ${pkg_url} | grep 'Content-Length' | awk '{print $2}' | tr -d '\r')

        if [[ -f ${dest_file} ]]; then
            rsize=$(stat -c "%s" ${dest_file})
            if (( ${rsize} == ${pkgsize} )); then
                echo "Skipping ${fname}, size ${pkgsize}"
                continue
            fi
        fi

        download-with-checksum ${pkg_url} ${dest_file} ${pkgmd5} || {
            echo "Failed to download ${pkg_url}: checksum mismatch"
            echo ${pkg_url} >> ${TMP_DIR}/failed
            EXIT_STATUS=2
            EXIT_MSG="some files have bad checksums."
        }
    done < <(wget -O- ${repo_url} | $HTMLPARSE)
}
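
# mirror one channel/arch from its repodata, skipping packages whose local
# size already matches, and moving the metadata into place only at the end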
function sync_repo () {
    local repo_url="$1"
    local local_dir="$2"

    [[ ! -d ${local_dir} ]] && mkdir -p ${local_dir}

    repodata_url="${repo_url}/repodata.json"
    bz2_repodata_url="${repo_url}/repodata.json.bz2"
    tmp_repodata="${TMP_DIR}/repodata.json"
    tmp_bz2_repodata="${TMP_DIR}/repodata.json.bz2"

    check-and-download ${repodata_url} ${tmp_repodata}
    check-and-download ${bz2_repodata_url} ${tmp_bz2_repodata}
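
    # flatten repodata.json's "packages" map into "filename size md5" lines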
    jq_cmd='.packages | to_entries[] | [.key, .value.size, .value.md5] | map(tostring) | join(" ")'

    while read line; do
        read -a tokens <<< $line
        pkgfile=${tokens[0]}
        pkgsize=${tokens[1]}
        pkgmd5=${tokens[2]}

        pkg_url="${repo_url}/${pkgfile}"
        dest_file="${local_dir}/${pkgfile}"

        if [[ -f ${dest_file} ]]; then
            rsize=$(stat -c "%s" ${dest_file})
            if (( ${rsize} == ${pkgsize} )); then
                echo "Skipping ${pkgfile}, size ${pkgsize}"
                continue
            fi
        fi

        download-with-checksum ${pkg_url} ${dest_file} ${pkgmd5} || {
            echo "Failed to download ${pkg_url}: checksum mismatch"
            echo ${pkg_url} >> ${TMP_DIR}/failed
            EXIT_STATUS=2
            EXIT_MSG="some files have bad checksums."
        }
    done < <(bzip2 -c -d ${tmp_bz2_repodata} | jq -r "${jq_cmd}")

    mv -f "${TMP_DIR}/repodata.json" "${local_dir}/repodata.json"
    mv -f "${TMP_DIR}/repodata.json.bz2" "${local_dir}/repodata.json.bz2"
}
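
# main driver: installers first, then official repos, then cloud channels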
sync_installer "${CONDA_REPO_BASE}/archive/" "${LOCAL_DIR_BASE}/archive/"
sync_installer "${CONDA_REPO_BASE}/miniconda/" "${LOCAL_DIR_BASE}/miniconda/"
for repo in ${CONDA_REPOS[@]}; do
    for arch in ${CONDA_ARCHES[@]}; do
        remote_url="${CONDA_REPO_BASE}/pkgs/$repo/$arch"
        local_dir="${LOCAL_DIR_BASE}/pkgs/$repo/$arch"
        sync_repo "${remote_url}" "${local_dir}" || true
    done
done
for repo in ${CONDA_CLOUD_REPOS[@]}; do
    remote_url="${CONDA_CLOUD_BASE}/${repo}"
    local_dir="${LOCAL_DIR_BASE}/cloud/${repo}"
    sync_repo "${remote_url}" "${local_dir}" || true
done
[[ -f ${TMP_DIR}/failed ]] && {
    echo "failed to download the following packages:"
    cat ${TMP_DIR}/failed
    mv ${TMP_DIR}/failed ${LOCAL_DIR_BASE}/failed_packages.txt
}
[[ -z $EXIT_MSG ]] || echo $EXIT_MSG
exit $EXIT_STATUS

helpers/anaconda-filelist.py

@@ -1,37 +0,0 @@
#!/usr/bin/env python3
from datetime import datetime
from pyquery import PyQuery as pq
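

# Parse a directory-listing HTML table: each data row has four cells holding
# the file name, size, modification date, and MD5 checksum.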
def get_filelist(htmlstring):
    d = pq(htmlstring)
    for tr in d('table').find('tr'):
        tds = pq(tr).find('td')
        if len(tds) != 4:
            continue
        fname = tds[0].find('a').text
        mdate = tds[2].text
        md5 = tds[3].text
        # note: strftime("%s") (epoch seconds) is a non-portable glibc extension
        ts = datetime.strptime(mdate, "%Y-%m-%d %H:%M:%S").strftime("%s")
        yield (fname, ts, md5)
if __name__ == "__main__":
    import argparse
    import fileinput

    parser = argparse.ArgumentParser()
    parser.add_argument("htmlfile", nargs='?', default="-")
    args = parser.parse_args()

    if args.htmlfile == "-":
        # lines from fileinput already end in '\n'; join them as-is
        htmlstring = ''.join(fileinput.input())
    else:
        with open(args.htmlfile) as f:
            htmlstring = f.read()

    for file_record in get_filelist(htmlstring):
        print("\t".join(file_record))
# vim: ts=4 sw=4 sts=4 expandtab