From d6f4e5b3e113b56424954ec4379467e4e45d895a Mon Sep 17 00:00:00 2001 From: Maksim Smolin Date: Fri, 17 Apr 2020 13:33:55 -0700 Subject: [PATCH] [SGD] Imagenet example (basic) (#8020) * Checkpoint the image-models example * Update cluster definition * Fix copyright info * Use original args * Checkpoint fixes * Add README * Add some missing features * Format * Get rid of the unused Namespace class * Address comments * Link the imagenet example in docs * Cleanup * Fix lint --- doc/source/raysgd/raysgd_pytorch.rst | 3 + .../torch/examples/image_models/.gitignore | 2 + .../examples/image_models/LICENSE_THIRDPARTY | 202 ++++++++ .../sgd/torch/examples/image_models/README.md | 48 ++ .../torch/examples/image_models/__init__.py | 0 .../sgd/torch/examples/image_models/args.py | 477 ++++++++++++++++++ .../torch/examples/image_models/cluster.yaml | 102 ++++ .../torch/examples/image_models/get_data.sh | 4 + .../sgd/torch/examples/image_models/train.py | 151 ++++++ 9 files changed, 989 insertions(+) create mode 100644 python/ray/util/sgd/torch/examples/image_models/.gitignore create mode 100644 python/ray/util/sgd/torch/examples/image_models/LICENSE_THIRDPARTY create mode 100644 python/ray/util/sgd/torch/examples/image_models/README.md create mode 100644 python/ray/util/sgd/torch/examples/image_models/__init__.py create mode 100644 python/ray/util/sgd/torch/examples/image_models/args.py create mode 100644 python/ray/util/sgd/torch/examples/image_models/cluster.yaml create mode 100755 python/ray/util/sgd/torch/examples/image_models/get_data.sh create mode 100644 python/ray/util/sgd/torch/examples/image_models/train.py diff --git a/doc/source/raysgd/raysgd_pytorch.rst b/doc/source/raysgd/raysgd_pytorch.rst index f50f00e04..5d85b27d6 100644 --- a/doc/source/raysgd/raysgd_pytorch.rst +++ b/doc/source/raysgd/raysgd_pytorch.rst @@ -717,6 +717,9 @@ to contribute an example, feel free to create a `pull request here `__: Fine-tuning a ResNet50 model on VOC with Batch Norm. 
+- `ImageNet Models example `__: + Training state-of-the-art ImageNet models. + - `CIFAR10 example `__: Training a ResNet18 model on CIFAR10. diff --git a/python/ray/util/sgd/torch/examples/image_models/.gitignore b/python/ray/util/sgd/torch/examples/image_models/.gitignore new file mode 100644 index 000000000..663d6c715 --- /dev/null +++ b/python/ray/util/sgd/torch/examples/image_models/.gitignore @@ -0,0 +1,2 @@ +imagenette2.tgz +/data diff --git a/python/ray/util/sgd/torch/examples/image_models/LICENSE_THIRDPARTY b/python/ray/util/sgd/torch/examples/image_models/LICENSE_THIRDPARTY new file mode 100644 index 000000000..6220663bf --- /dev/null +++ b/python/ray/util/sgd/torch/examples/image_models/LICENSE_THIRDPARTY @@ -0,0 +1,202 @@ +Components of timm (args.py): + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2019 Ross Wightman + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/python/ray/util/sgd/torch/examples/image_models/README.md b/python/ray/util/sgd/torch/examples/image_models/README.md new file mode 100644 index 000000000..81e0bf01b --- /dev/null +++ b/python/ray/util/sgd/torch/examples/image_models/README.md @@ -0,0 +1,48 @@ +# Imagenet Models RaySGD Example + +Based on [the timm package](https://github.com/rwightman/pytorch-image-models). + +# Usage + +## Ray autoscaler + +- `ray up cluster.yaml` +- `ray rsync-up -A cluster.yaml` +- `ray submit train.py -- data -n=4` + +## Manual + +- Make `train.py` and `args.py` available on the remote host. +- `pip install timm` +- Download and unpack an ImageNet-compatible dataset (has to be full size). Internally we use [Imagenette](https://github.com/fastai/imagenette) for development purposes. +- Optional: setup a ray cluster (`ray start --head` on the head node and `ray start --redis-address HEAD_ADDRESS` on each of the worker nodes). +- Run `python train.py DATA_DIRECTORY` on the head node. + +## Manual (single node) + +- `pip install timm` +- `ray start --head` +- `python train.py DATA_DIRECTORY` +- Use the `-n` argument to control the number of processes + GPUs used. + +# Advantages + +Compared to the original `timm` package, the RaySGD train script has a few advantages: + +- Compatibility with Ray autoscaler (automatic simple cluster provisioning). 
+- Built-in fault tolerance (epochs will checkpoint and restart if a worker fails). This means you can, for example, make all your worker nodes preemptible (e.g. AWS spot requests). **Note:** the head node *must* be non-preemptible. +- Since a Ray cluster is already set up, you can run other distributed tasks. + +# Limitations + +Support for some command line flags from the original `timm` package has been intentionally dropped: +- `-j/--workers` - Ray can start multiple training processes with `-n/--ray-num-workers` instead. +- `--num-gpu` - Ray can use multiple GPUs by launching multiple processes with `-n/--ray-num-workers` instead. (`DistributedDataParallel` is faster than simple `DataParallel` in practice anyway) + +Other features are still in the works: +- Logging +- Compatibility with timm checkpoints +- EMA +- Sync batch norm +- Learning rate scheduling +- Some testing diff --git a/python/ray/util/sgd/torch/examples/image_models/__init__.py b/python/ray/util/sgd/torch/examples/image_models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/ray/util/sgd/torch/examples/image_models/args.py b/python/ray/util/sgd/torch/examples/image_models/args.py new file mode 100644 index 000000000..c6fe0fca0 --- /dev/null +++ b/python/ray/util/sgd/torch/examples/image_models/args.py @@ -0,0 +1,477 @@ +# Modified by Maksim Smolin in 2020 +# +# Original work by Ross Wightman as part of the timm package +# (see LICENSE_THIRDPARTY) + +# Note: other authors MUST include themselves in the above copyright notice +# in order to abide by the terms of the Apache license + +import logging +import argparse +import yaml + +config_parser = parser = argparse.ArgumentParser( + description="Training Config", add_help=False) +parser.add_argument( + "-c", + "--config", + default="", + type=str, + metavar="FILE", + help="YAML config file specifying default arguments") + +parser = argparse.ArgumentParser(description="PyTorch ImageNet Training") +# Dataset / Model parameters 
+parser.add_argument("data", metavar="DIR", help="path to dataset") +parser.add_argument( + "--model", + default="resnet101", + type=str, + metavar="MODEL", + help="Name of model to train (default: 'resnet101')") +parser.add_argument( + "--pretrained", + action="store_true", + default=False, + help="Start with pretrained version of specified network (if avail)") +parser.add_argument( + "--initial-checkpoint", + default="", + type=str, + metavar="PATH", + help="Initialize model from this checkpoint (default: none)") +parser.add_argument( + "--resume", + default="", + type=str, + metavar="PATH", + help="Resume full model and optimizer state from checkpoint " + "(default: none)") +parser.add_argument( + "--no-resume-opt", + action="store_true", + default=False, + help="prevent resume of optimizer state when resuming model") +parser.add_argument( + "--num-classes", + type=int, + default=1000, + metavar="N", + help="number of label classes (default: 1000)") +parser.add_argument( + "--gp", + default="avg", + type=str, + metavar="POOL", + help=("Type of global pool, 'avg', 'max', 'avgmax', 'avgmaxc' " + "(default: 'avg')")) +parser.add_argument( + "--img-size", + type=int, + default=None, + metavar="N", + help="Image patch size (default: None => model default)") +parser.add_argument( + "--crop-pct", + default=None, + type=float, + metavar="N", + help="Input image center crop percent (for validation only)") +parser.add_argument( + "--mean", + type=float, + nargs="+", + default=None, + metavar="MEAN", + help="Override mean pixel value of dataset") +parser.add_argument( + "--std", + type=float, + nargs="+", + default=None, + metavar="STD", + help="Override std deviation of dataset") +parser.add_argument( + "--interpolation", + default="", + type=str, + metavar="NAME", + help="Image resize interpolation type (overrides model)") +parser.add_argument( + "-b", + "--batch-size", + type=int, + default=32, + metavar="N", + help="input batch size for training (default: 32)") 
+parser.add_argument( + "-vb", + "--validation-batch-size-multiplier", + type=int, + default=1, + metavar="N", + help="ratio of validation batch size to training batch size (default: 1)") +parser.add_argument( + "--drop", + type=float, + default=0.0, + metavar="PCT", + help="Dropout rate (default: 0.)") +parser.add_argument( + "--drop-connect", + type=float, + default=None, + metavar="PCT", + help="Drop connect rate, DEPRECATED, use drop-path (default: None)") +parser.add_argument( + "--drop-path", + type=float, + default=None, + metavar="PCT", + help="Drop path rate (default: None)") +parser.add_argument( + "--drop-block", + type=float, + default=None, + metavar="PCT", + help="Drop block rate (default: None)") +parser.add_argument( + "--jsd", + action="store_true", + default=False, + help="Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`." +) +# Optimizer parameters +parser.add_argument( + "--opt", + default="sgd", + type=str, + metavar="OPTIMIZER", + help="Optimizer (default: 'sgd')") +parser.add_argument( + "--opt-eps", + default=1e-8, + type=float, + metavar="EPSILON", + help="Optimizer Epsilon (default: 1e-8)") +parser.add_argument( + "--momentum", + type=float, + default=0.9, + metavar="M", + help="SGD momentum (default: 0.9)") +parser.add_argument( + "--weight-decay", + type=float, + default=0.0001, + help="weight decay (default: 0.0001)") +# Learning rate schedule parameters +parser.add_argument( + "--sched", + default="step", + type=str, + metavar="SCHEDULER", + help="LR scheduler (default: 'step')") +parser.add_argument( + "--lr", + type=float, + default=0.01, + metavar="LR", + help="learning rate (default: 0.01)") +parser.add_argument( + "--lr-noise", + type=float, + nargs="+", + default=None, + metavar="pct, pct", + help="learning rate noise on/off epoch percentages") +parser.add_argument( + "--lr-noise-pct", + type=float, + default=0.67, + metavar="PERCENT", + help="learning rate noise limit percent (default: 0.67)") 
+parser.add_argument( + "--lr-noise-std", + type=float, + default=1.0, + metavar="STDDEV", + help="learning rate noise std-dev (default: 1.0)") +parser.add_argument( + "--warmup-lr", + type=float, + default=0.0001, + metavar="LR", + help="warmup learning rate (default: 0.0001)") +parser.add_argument( + "--min-lr", + type=float, + default=1e-5, + metavar="LR", + help="lower lr bound for cyclic schedulers that hit 0 (1e-5)") +parser.add_argument( + "--epochs", + type=int, + default=200, + metavar="N", + help="number of epochs to train (default: 200)") +parser.add_argument( + "--start-epoch", + default=None, + type=int, + metavar="N", + help="manual epoch number (useful on restarts)") +parser.add_argument( + "--decay-epochs", + type=float, + default=30, + metavar="N", + help="epoch interval to decay LR") +parser.add_argument( + "--warmup-epochs", + type=int, + default=3, + metavar="N", + help="epochs to warmup LR, if scheduler supports") +parser.add_argument( + "--cooldown-epochs", + type=int, + default=10, + metavar="N", + help="epochs to cooldown LR at min_lr, after cyclic schedule ends") +parser.add_argument( + "--patience-epochs", + type=int, + default=10, + metavar="N", + help="patience epochs for Plateau LR scheduler (default: 10)") +parser.add_argument( + "--decay-rate", + "--dr", + type=float, + default=0.1, + metavar="RATE", + help="LR decay rate (default: 0.1)") +# Augmentation parameters +parser.add_argument( + "--color-jitter", + type=float, + default=0.4, + metavar="PCT", + help="Color jitter factor (default: 0.4)") +parser.add_argument( + "--aa", + type=str, + default=None, + metavar="NAME", + help="Use AutoAugment policy. 'v0' or 'original'. 
(default: None)") +# parser.add_argument( +# "--aug-splits", +# type=int, +# default=0, +# help="Number of augmentation splits (default: 0, valid: 0 or >=2)") +parser.add_argument( + "--reprob", + type=float, + default=0., + metavar="PCT", + help="Random erase prob (default: 0.)") +parser.add_argument( + "--remode", + type=str, + default="const", + help="Random erase mode (default: 'const')") +parser.add_argument( + "--recount", type=int, default=1, help="Random erase count (default: 1)") +parser.add_argument( + "--resplit", + action="store_true", + default=False, + help="Do not random erase first (clean) augmentation split") +parser.add_argument( + "--mixup", + type=float, + default=0.0, + help="mixup alpha, mixup enabled if > 0. (default: 0.)") +parser.add_argument( + "--mixup-off-epoch", + default=0, + type=int, + metavar="N", + help="turn off mixup after this epoch, disabled if 0 (default: 0)") +parser.add_argument( + "--smoothing", + type=float, + default=0.1, + help="label smoothing (default: 0.1)") +parser.add_argument( + "--train-interpolation", + type=str, + default="random", + help="Training interpolation (random, bilinear, bicubic default: 'random')" +) +# Batch norm parameters +# (only works with gen_efficientnet based models currently) +parser.add_argument( + "--bn-tf", + action="store_true", + default=False, + help="Use Tensorflow BatchNorm defaults for models that support it " + "(default: False)") +parser.add_argument( + "--bn-momentum", + type=float, + default=None, + help="BatchNorm momentum override (if not None)") +parser.add_argument( + "--bn-eps", + type=float, + default=None, + help="BatchNorm epsilon override (if not None)") +parser.add_argument( + "--sync-bn", + action="store_true", + help="Enable NVIDIA Apex or Torch synchronized BatchNorm.") +parser.add_argument( + "--dist-bn", + type=str, + default="", + help=("Distribute BatchNorm stats between nodes after each epoch " + "('broadcast', 'reduce', or '')")) +# parser.add_argument( +# 
"--split-bn", +# action="store_true", +# help="Enable separate BN layers per augmentation split.") +# Model Exponential Moving Average +parser.add_argument( + "--model-ema", + action="store_true", + default=False, + help="Enable tracking moving average of model weights") +parser.add_argument( + "--model-ema-force-cpu", + action="store_true", + default=False, + help="Force ema to be tracked on CPU, rank=0 node only. " + "Disables EMA validation.") +parser.add_argument( + "--model-ema-decay", + type=float, + default=0.9998, + help="decay factor for model weights moving average (default: 0.9998)") +# Misc +parser.add_argument( + "--seed", + type=int, + default=42, + metavar="S", + help="random seed (default: 42)") +parser.add_argument( + "--log-interval", + type=int, + default=50, + metavar="N", + help="how many batches to wait before logging training status") +parser.add_argument( + "--recovery-interval", + type=int, + default=0, + metavar="N", + help="how many batches to wait before writing recovery checkpoint") +parser.add_argument( + "--no-gpu", + action="store_true", + default=False, + help="do not use a GPU even if available") +parser.add_argument( + "--save-images", + action="store_true", + default=False, + help="save images of input batches every log interval for debugging") +parser.add_argument( + "--amp", + action="store_true", + default=False, + help="use NVIDIA amp for mixed precision training") +parser.add_argument( + "--pin-mem", + action="store_true", + default=False, + help="Pin CPU memory in DataLoader for more efficient (sometimes) " + "transfer to GPU.") +parser.add_argument( + "--no-prefetcher", + action="store_true", + default=False, + help="disable fast prefetcher") +parser.add_argument( + "--output", + default="", + type=str, + metavar="PATH", + help="path to output folder (default: none, current dir)") +parser.add_argument( + "--eval-metric", + default="prec1", + type=str, + metavar="EVAL_METRIC", + help="Best metric (default: 'prec1')") 
+parser.add_argument( + "--tta", + type=int, + default=0, + metavar="N", + help="Test/inference time augmentation (oversampling) factor. 0=None " + "(default: 0)") +parser.add_argument("--local_rank", default=0, type=int) + +# ray +parser.add_argument( + "--ray-address", + default="auto", + metavar="ADDR", + help="Ray cluster address. [default=auto]") +parser.add_argument( + "-n", + "--ray-num-workers", + type=int, + default=1, + metavar="N", + help="Number of Ray replicas to use. [default=1]") + + +def parse_args(): + """Parse the command line, layering an optional YAML config underneath. + + Values from the file given with -c/--config become parser defaults, so + flags passed explicitly on the command line still win. + + Returns: + (args, args_text): the parsed namespace with the derived fields + (prefetcher, distributed, device, num_aug_splits, split_bn) set, + and a YAML dump of the arguments taken before those fields + were added. + """ + # Do we have a config file to parse? + args_config, remaining = config_parser.parse_known_args() + if args_config.config: + with open(args_config.config, "r") as f: + cfg = yaml.safe_load(f) + parser.set_defaults(**cfg) + + # The main arg parser parses the rest of the args, the usual + # defaults will have been overridden if config file specified. + args = parser.parse_args(remaining) + + # Cache the args as a text string to save them in the output dir later + args_text = yaml.safe_dump(args.__dict__, default_flow_style=False) + + # Arguments pre-processing from the original train.py + args.prefetcher = not args.no_prefetcher + args.distributed = False # ray SGD handles this (DistributedSampler) + args.device = "cuda" # ray should handle this + + # The prefetcher needs CUDA, so it is only turned off for --no-gpu runs. + if args.no_gpu and args.prefetcher: + logging.warning("Prefetcher needs CUDA currently " + "(might be a bug in timm). 
" + "Disabling it.") + args.prefetcher = False + + # assert args.aug_splits == 0 or args.aug_splits > 1, ( + # "Split must be 0 or 2+") + + # args.num_aug_splits = args.aug_splits + args.num_aug_splits = 0 # todo: + + args.split_bn = False # todo: + + return args, args_text diff --git a/python/ray/util/sgd/torch/examples/image_models/cluster.yaml b/python/ray/util/sgd/torch/examples/image_models/cluster.yaml new file mode 100644 index 000000000..d1e099341 --- /dev/null +++ b/python/ray/util/sgd/torch/examples/image_models/cluster.yaml @@ -0,0 +1,102 @@ +# An unique identifier for the head node and workers of this cluster. 
+ # InstanceMarketOptions: + # MarketType: spot + +setup_commands: + # This replaces the standard anaconda Ray installation + - ray || pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp36-cp36m-manylinux1_x86_64.whl + # Uncomment this and the filemount to update the Ray installation with your local Ray code + # - rm -rf ./anaconda3/lib/python3.6/site-packages/ray/util/sgd/ + # - cp -rf ~/sgd ./anaconda3/lib/python3.6/site-packages/ray/util/ + + # Installing this without -U to make sure we don't replace the existing Ray installation + - pip install ray[rllib] + - pip install -U ipdb torch torchvision tqdm + # Install Apex + - rm -rf apex || true + - git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir ./ || true + + # Install timm and get data + - pip install timm + - ls data || (curl https://s3.amazonaws.com/fast-ai-imageclas/imagenette2.tgz -O && tar zxvf imagenette2.tgz && mv imagenette2 data) + + +file_mounts: { + # This should point to ray/python/ray/util/sgd. + # ~/anaconda3/lib/python3.6/site-packages/ray/util/sgd/: ../../../../sgd, +} + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: [] + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# # Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: + - ray stop + - ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --object-store-memory=1000000000 + +# Command to start ray on worker nodes. You don't need to change this. 
+worker_start_ray_commands: + - ray stop + - ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --object-store-memory=1000000000 + diff --git a/python/ray/util/sgd/torch/examples/image_models/get_data.sh b/python/ray/util/sgd/torch/examples/image_models/get_data.sh new file mode 100755 index 000000000..239f6296c --- /dev/null +++ b/python/ray/util/sgd/torch/examples/image_models/get_data.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +curl https://s3.amazonaws.com/fast-ai-imageclas/imagenette2.tgz -O +tar zxvf imagenette2.tgz +mv imagenette2 data diff --git a/python/ray/util/sgd/torch/examples/image_models/train.py b/python/ray/util/sgd/torch/examples/image_models/train.py new file mode 100644 index 000000000..26d070dd9 --- /dev/null +++ b/python/ray/util/sgd/torch/examples/image_models/train.py @@ -0,0 +1,151 @@ +# Based on work by Ross Wightman as part of the timm package +# (see LICENSE_THIRDPARTY) +# +# As modified by +# - Maksim Smolin in 2020 + +# Note: other authors MUST include themselves in the above copyright notice +# in order to abide by the terms of the Apache license + +from os.path import join + +from tqdm import trange + +import torch.nn as nn + +from timm.data import Dataset, create_loader +from timm.data import resolve_data_config, FastCollateMixup +from timm.models import create_model, convert_splitbn_model +from timm.optim import create_optimizer +from timm.utils import setup_default_logging + +import ray +from ray.util.sgd.utils import BATCH_SIZE + +from ray.util.sgd import TorchTrainer +# from ray.util.sgd.torch import TrainingOperator + +from ray.util.sgd.torch.examples.image_models.args import parse_args + + +def model_creator(config): + """Build the timm model selected by config["args"] (see --model).""" + args = config["args"] + + model = create_model( + args.model, + pretrained=args.pretrained, + num_classes=args.num_classes, + drop_rate=args.drop, + drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + 
global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps, + checkpoint_path=args.initial_checkpoint) + + # always false right now + if args.split_bn: + assert args.num_aug_splits > 1 or args.resplit + model = convert_splitbn_model(model, max(args.num_aug_splits, 2)) + + return model + + +def data_creator(config): + """Create the loaders for the train/ and val/ folders under args.data. + + Returns: + (train_loader, eval_loader), both built with timm's create_loader. + """ + # torch.manual_seed(args.seed + torch.distributed.get_rank()) + + args = config["args"] + + # todo: verbose should depend on rank + data_config = resolve_data_config(vars(args), verbose=True) + + dataset_train = Dataset(join(args.data, "train")) + dataset_eval = Dataset(join(args.data, "val")) + + collate_fn = None + if args.prefetcher and args.mixup > 0: + # collate conflict (need to support deinterleaving in collate mixup) + assert args.num_aug_splits == 0 + collate_fn = FastCollateMixup(args.mixup, args.smoothing, + args.num_classes) + + common_params = dict( + input_size=data_config["input_size"], + use_prefetcher=args.prefetcher, + mean=data_config["mean"], + std=data_config["std"], + num_workers=1, + distributed=args.distributed, + pin_memory=args.pin_mem) + + train_loader = create_loader( + dataset_train, + is_training=True, + batch_size=config[BATCH_SIZE], + re_prob=args.reprob, + re_mode=args.remode, + re_count=args.recount, + re_split=args.resplit, + collate_fn=collate_fn, + color_jitter=args.color_jitter, + auto_augment=args.aa, + interpolation=args.train_interpolation, + num_aug_splits=args.num_aug_splits, # always 0 right now + **common_params) + eval_loader = create_loader( + dataset_eval, + is_training=False, + batch_size=args.validation_batch_size_multiplier * config[BATCH_SIZE], + interpolation=data_config["interpolation"], + crop_pct=data_config["crop_pct"], + **common_params) + + return train_loader, eval_loader + + +def optimizer_creator(model, config): + """Build the optimizer for ``model`` with timm's create_optimizer.""" + args = config["args"] + return create_optimizer(args, model) + + +def loss_creator(config): + # there should be more complicated logic here, but
we don't support + # separate train and eval losses yet + return nn.CrossEntropyLoss() + + +def main(): + setup_default_logging() + + args, args_text = parse_args() + + ray.init(address=args.ray_address) + + trainer = TorchTrainer( + model_creator=model_creator, + data_creator=data_creator, + optimizer_creator=optimizer_creator, + loss_creator=loss_creator, + use_tqdm=True, + use_fp16=args.amp, + apex_args={"opt_level": "O1"}, + config={ + "args": args, + BATCH_SIZE: args.batch_size + }, + num_workers=args.ray_num_workers) + + pbar = trange(args.epochs, unit="epoch") + for i in pbar: + trainer.train() + + val_stats = trainer.validate() + pbar.set_postfix(dict(acc=val_stats["val_accuracy"])) + + trainer.shutdown() + + +if __name__ == "__main__": + main()