diff --git a/doc/source/tune/_tutorials/overview.rst b/doc/source/tune/_tutorials/overview.rst
index 101ece283..9dac8b5a6 100644
--- a/doc/source/tune/_tutorials/overview.rst
+++ b/doc/source/tune/_tutorials/overview.rst
@@ -126,6 +126,16 @@ Learn how to use Tune in your browser with the following Colab-based exercises.
+
+
+ | Fine-tuning Huggingface Transformers with PBT. |
+ Huggingface Transformers/Pytorch |
+
+
+
+
+ |
+
Tutorial source files `can be found here `_.
diff --git a/python/ray/tune/examples/pbt_transformers/pbt_transformers.py b/python/ray/tune/examples/pbt_transformers/pbt_transformers.py
index 0e827da3f..ee144f6e3 100644
--- a/python/ray/tune/examples/pbt_transformers/pbt_transformers.py
+++ b/python/ray/tune/examples/pbt_transformers/pbt_transformers.py
@@ -2,6 +2,7 @@ import os
import ray
from ray.tune import CLIReporter
+from ray.tune.integration.wandb import wandb_mixin # noqa: F401
from ray.tune.schedulers import PopulationBasedTraining
from ray import tune
@@ -15,22 +16,15 @@ from transformers import (AutoConfig, AutoModelForSequenceClassification,
Trainer, TrainingArguments)
-def get_trainer(model_name_or_path,
- train_dataset,
- eval_dataset,
- task_name,
- training_args,
- wandb_args=None):
+def get_trainer(model_name_or_path, train_dataset, eval_dataset, task_name,
+ training_args):
try:
num_labels = glue_tasks_num_labels[task_name]
except KeyError:
raise ValueError("Task not found: %s" % (task_name))
config = AutoConfig.from_pretrained(
- model_name_or_path,
- num_labels=num_labels,
- finetuning_task=task_name,
- )
+ model_name_or_path, num_labels=num_labels, finetuning_task=task_name)
model = AutoModelForSequenceClassification.from_pretrained(
model_name_or_path,
@@ -41,8 +35,7 @@ def get_trainer(model_name_or_path,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
- compute_metrics=build_compute_metrics_fn(task_name),
- wandb_args=wandb_args)
+ compute_metrics=build_compute_metrics_fn(task_name))
return tune_trainer
@@ -62,6 +55,8 @@ def recover_checkpoint(tune_checkpoint_dir, model_name=None):
# __train_begin__
+# Uncomment the `@wandb_mixin` decorator on the next line to use W&B!
+# @wandb_mixin
def train_transformer(config, checkpoint_dir=None):
data_args = DataTrainingArguments(
task_name=config["task_name"], data_dir=config["data_dir"])
@@ -96,21 +91,9 @@ def train_transformer(config, checkpoint_dir=None):
logging_dir="./logs",
)
- # Arguments for W&B.
- name = tune.get_trial_name()
- wandb_args = {
- "project_name": "transformers_pbt",
- "watch": "false", # Either set to gradient, false, or all
- "run_name": name,
- }
-
tune_trainer = get_trainer(
recover_checkpoint(checkpoint_dir, config["model_name"]),
- train_dataset,
- eval_dataset,
- config["task_name"],
- training_args,
- wandb_args=wandb_args)
+ train_dataset, eval_dataset, config["task_name"], training_args)
tune_trainer.train(
recover_checkpoint(checkpoint_dir, config["model_name"]))
@@ -159,6 +142,11 @@ def tune_transformer(num_samples=8,
"weight_decay": tune.uniform(0.0, 0.3),
"num_epochs": tune.choice([2, 3, 4, 5]),
"max_steps": 1 if smoke_test else -1, # Used for smoke test.
+ "wandb": {
+ "project": "pbt_transformers",
+ "reinit": True,
+ "allow_val_change": True
+ }
}
scheduler = PopulationBasedTraining(
diff --git a/python/ray/tune/examples/pbt_transformers/trainer.py b/python/ray/tune/examples/pbt_transformers/trainer.py
index 79c193a08..0911c0fd7 100644
--- a/python/ray/tune/examples/pbt_transformers/trainer.py
+++ b/python/ray/tune/examples/pbt_transformers/trainer.py
@@ -5,28 +5,20 @@ from typing import Dict, Optional, Tuple
from ray import tune
import transformers
-from transformers.file_utils import is_torch_tpu_available
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
import torch
from torch.utils.data import Dataset
-import wandb
-
logger = logging.getLogger(__name__)
"""A Trainer class integrated with Tune.
The only changes to the original transformers.Trainer are:
- Report eval metrics to Tune
- Save state using Tune's checkpoint directories
- - Pass in extra arguments for wandb
"""
class TuneTransformerTrainer(transformers.Trainer):
- def __init__(self, *args, wandb_args=None, **kwargs):
- self.wandb_args = wandb_args
- super().__init__(*args, **kwargs)
-
def get_optimizers(
self, num_training_steps: int
) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
@@ -58,22 +50,3 @@ class TuneTransformerTrainer(transformers.Trainer):
os.path.join(output_dir, "optimizer.pt"))
torch.save(self.current_scheduler.state_dict(),
os.path.join(output_dir, "scheduler.pt"))
-
- def _setup_wandb(self):
- if self.is_world_master() and self.wandb_args is not None:
- wandb.init(
- project=self.wandb_args["project_name"],
- name=self.wandb_args["run_name"],
- id=self.wandb_args["run_name"],
- dir=tune.get_trial_dir(),
- config=vars(self.args),
- reinit=True,
- allow_val_change=True,
- resume=self.wandb_args["run_name"])
- # keep track of model topology and gradients, unsupported on TPU
- if not is_torch_tpu_available(
- ) and self.wandb_args["watch"] != "false":
- wandb.watch(
- self.model,
- log=self.wandb_args["watch"],
- log_freq=max(100, self.args.logging_steps))
diff --git a/python/ray/tune/integration/wandb.py b/python/ray/tune/integration/wandb.py
index 34d559634..66d13ee4e 100644
--- a/python/ray/tune/integration/wandb.py
+++ b/python/ray/tune/integration/wandb.py
@@ -103,10 +103,18 @@ def _set_api_key(wandb_config):
if api_key:
os.environ[WANDB_ENV_VAR] = api_key
elif not os.environ.get(WANDB_ENV_VAR):
+ try:
+ # Check if user is already logged into wandb.
+ wandb.ensure_configured()
+ if wandb.api.api_key:
+ logger.info("Already logged into W&B.")
+ return
+ except AttributeError:
+ pass
raise ValueError(
"No WandB API key found. Either set the {} environment "
- "variable or pass `api_key` or `api_key_file` in the config".
- format(WANDB_ENV_VAR))
+ "variable, pass `api_key` or `api_key_file` in the config, "
+ "or run `wandb login` from the command line".format(WANDB_ENV_VAR))
class _WandbLoggingProcess(Process):