diff --git a/README.md b/README.md index 51bf901..6e73066 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,64 @@ Shadowmire syncs PyPI (or plain HTTP(S) PyPI mirrors using Shadowmire) with a li ### Background -PyPI's XML-RPC Mirroring Support: +Bandersnatch is the recommended solution to sync from PyPI. However, it has these 2 issues that haven't been solved for a long time: + +- Bandersnatch does not support removing packages that have been removed from upstream, making it easier to be the target of a supply chain attack. +- The upstream must implement [XML-RPC APIs](https://warehouse.pypa.io/api-reference/xml-rpc.html#mirroring-support), which is not acceptable for most mirror sites. + +Shadowmire is a light solution to these issues. + +### Syncing Protocol + +#### From PyPI + +PyPI's XML-RPC APIs have a `list_packages_with_serial()` method to list ALL packages with "serial" (you could consider it as a version integer that just increases every few moments). `changelog_last_serial()` and `changelog_since_serial()` are NOT used as they could not handle package deletion. Local packages not in the list result are removed. + +Results from `list_packages_with_serial()` are stored in `remote.json`. `local.db` is a sqlite database which just stores every local package name and its local serial. `local.json` is dumped from `local.db` for downstream consumption. + +#### From upstream using shadowmire + +Obviously, `list_packages_with_serial()`'s alternative is the `local.json`, which could be easily served by any HTTP server. Don't use `local.db`, as it could have consistency issues when shadowmire upstream is syncing. + +### How to use + +If you just need to fetch all indexes (and then use a cache solution for packages): + +```shell +REPO=/path/to/pypi ./shadowmire.py sync +``` + +If `REPO` env is not set, it defaults to the current working directory. + +If you need to download all packages, add `--sync-packages`. 
+ +```shell +./shadowmire.py sync --sync-packages +``` + +Sync command also supports `--exclude` -- you could give multiple regexes like this: + +```shell +./shadowmire.py sync --exclude package1 --exclude ^0 +``` + +And `--shadowmire-upstream`, if you don't want to sync from PyPI directly. + +```shell +./shadowmire.py sync --shadowmire-upstream http://example.com/pypi/ +``` + +If you already have a pypi repo, use `genlocal` first to generate a local db: + +```shell +./shadowmire.py genlocal +``` + +Verify command could be used if you believe that something is wrong. It would remove packages NOT in local db, update all local packages, and delete unreferenced files in `packages` folder: + +```shell +./shadowmire.py verify +``` ## Acknowledgements diff --git a/shadowmire.py b/shadowmire.py old mode 100644 new mode 100755 index c0df66d..1b0acb8 --- a/shadowmire.py +++ b/shadowmire.py @@ -344,8 +344,10 @@ class SyncBase: self.simple_dir.mkdir(parents=True, exist_ok=True) self.packages_dir.mkdir(parents=True, exist_ok=True) self.sync_packages = sync_packages - - def filter_remote_with_excludes(self, remote: dict[str, int], excludes: list[re.Pattern]) -> dict[str, int]: + + def filter_remote_with_excludes( + self, remote: dict[str, int], excludes: list[re.Pattern] + ) -> dict[str, int]: if not excludes: return remote res = {} @@ -359,7 +361,9 @@ class SyncBase: res[k] = v return res - def determine_sync_plan(self, local: dict[str, int], excludes: list[re.Pattern]) -> Plan: + def determine_sync_plan( + self, local: dict[str, int], excludes: list[re.Pattern] + ) -> Plan: remote = self.fetch_remote_versions() remote = self.filter_remote_with_excludes(remote, excludes) # store remote to remote.json @@ -635,13 +639,22 @@ def main(args: argparse.Namespace) -> None: logging.basicConfig(level=log_level) logger.debug(args) - basedir = Path(".") + basedir = Path(os.environ.get("REPO", ".")) local_db = LocalVersionKV(basedir / "local.db", basedir / "local.json") + sync: SyncBase if 
args.command == "sync": - sync = SyncPyPI( - basedir=basedir, local_db=local_db, sync_packages=args.sync_packages - ) + if args.shadowmire_upstream: + sync = SyncPlainHTTP( + upstream=args.shadowmire_upstream, + basedir=basedir, + local_db=local_db, + sync_packages=args.sync_packages, + ) + else: + sync = SyncPyPI( + basedir=basedir, local_db=local_db, sync_packages=args.sync_packages + ) local = local_db.dump() plan = sync.determine_sync_plan(local, args.excludes) # save plan for debugging @@ -660,11 +673,21 @@ def main(args: argparse.Namespace) -> None: local_db.batch_set(local) local_db.dump_json() elif args.command == "verify": - sync = SyncPyPI( - basedir=basedir, local_db=local_db, sync_packages=args.sync_packages - ) + if args.shadowmire_upstream: + sync = SyncPlainHTTP( + upstream=args.shadowmire_upstream, + basedir=basedir, + local_db=local_db, + sync_packages=args.sync_packages, + ) + else: + sync = SyncPyPI( + basedir=basedir, local_db=local_db, sync_packages=args.sync_packages + ) local_names = set(local_db.keys()) - simple_dirs = set([i.name for i in (basedir / "simple").iterdir() if i.is_dir()]) + simple_dirs = set( + [i.name for i in (basedir / "simple").iterdir() if i.is_dir()] + ) for package_name in simple_dirs - local_names: sync.do_remove(package_name) sync.parallel_update(list(local_names)) @@ -696,6 +719,11 @@ if __name__ == "__main__": parser_sync.add_argument( "--exclude", help="Remote package names to exclude. 
Regex.", nargs="*" ) + parser_sync.add_argument( + "--shadowmire-upstream", + help="Use another upstream using shadowmire instead of PyPI", + type=str, + ) parser_genlocal = subparsers.add_parser( "genlocal", help="(Re)generate local db and json from simple/" ) @@ -708,6 +736,11 @@ if __name__ == "__main__": help="Sync packages instead of just indexes", action="store_true", ) + parser_verify.add_argument( + "--shadowmire-upstream", + help="Use another upstream using shadowmire instead of PyPI", + type=str, + ) args = parser.parse_args() if args.command is None: