Text Classification with Hugging Face

Fine-tune DistilBERT on IMDB sentiment classification and track training with GoodSeed via a Trainer callback.

Run it

pip install goodseed transformers datasets torch
python examples/hf_text_classification.py

Then view results:

goodseed serve

Full source

"""Fine-tune DistilBERT on IMDB sentiment and track with GoodSeed."""

import argparse
from pathlib import Path

import goodseed
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)


class GoodSeedCallback(TrainerCallback):
    """Log training metrics to a GoodSeed run."""

    def __init__(self, run):
        # Keep the run handle so on_log can write metrics to it.
        self.run = run

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Forward every numeric entry of ``logs`` to the run at the current global step."""
        if logs is None:
            return
        step = state.global_step
        for key, value in logs.items():
            # Only int/float values are forwarded; entries of any other type are skipped.
            if isinstance(value, (int, float)):
                self.run[key].log(value, step=step)


def main():
    """Fine-tune DistilBERT on IMDB sentiment and track the run with GoodSeed.

    Parses the command-line options, optionally subsamples both dataset
    splits, tokenizes them, trains with the Hugging Face ``Trainer`` while
    ``GoodSeedCallback`` forwards logged metrics to the run, and finally
    stores and prints the evaluation loss.
    """
    parser = argparse.ArgumentParser(description="HF text classification + GoodSeed")
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--batch-size", type=int, default=16)
    parser.add_argument("--lr", type=float, default=2e-5)
    parser.add_argument("--max-samples", type=int, default=2000,
                        help="Subset size for fast iteration (0 = full dataset)")
    args = parser.parse_args()

    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    dataset = load_dataset("imdb")
    if args.max_samples > 0:
        for split in ("train", "test"):
            # Clamp to the split size: select() rejects out-of-range indices,
            # so a --max-samples larger than the split would otherwise crash.
            subset_size = min(args.max_samples, len(dataset[split]))
            dataset[split] = dataset[split].shuffle(seed=42).select(range(subset_size))

    def tokenize(batch):
        """Tokenize a batch of reviews, padded/truncated to 256 tokens."""
        return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=256)

    dataset = dataset.map(tokenize, batched=True)
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Trainer output goes next to this script.
    output_dir = Path(__file__).parent / "hf_output"
    training_args = TrainingArguments(
        output_dir=str(output_dir),
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        learning_rate=args.lr,
        eval_strategy="steps",
        eval_steps=20,
        logging_steps=1,
        save_strategy="no",
        # Built-in reporting integrations are off; GoodSeedCallback does the tracking.
        report_to="none",
    )

    with goodseed.Run(name=f"imdb-{model_name}", tags=["hf", "imdb", "distilbert"]) as run:
        # Record the CLI options and static run settings before training starts.
        run["parameters"] = args
        run["model"] = model_name
        run["dataset"] = "imdb"
        run["device"] = str(training_args.device)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"],
            callbacks=[GoodSeedCallback(run)],
        )

        trainer.train()

        # One last evaluation after training; its loss is stored under eval/loss.
        eval_result = trainer.evaluate()
        run["eval/loss"] = eval_result["eval_loss"]
        print(f"\nEval loss: {eval_result['eval_loss']:.4f}")


if __name__ == "__main__":
    main()

Walkthrough

The key pattern here is a TrainerCallback that bridges Hugging Face’s Trainer logging to GoodSeed:

class GoodSeedCallback(TrainerCallback):
    """Log training metrics to a GoodSeed run."""

    def __init__(self, run):
        # Keep the run handle so on_log can write metrics to it.
        self.run = run

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Forward every numeric entry of ``logs`` to the run at the current global step."""
        if logs is None:
            return
        step = state.global_step
        for key, value in logs.items():
            # Only int/float values are forwarded; entries of any other type are skipped.
            if isinstance(value, (int, float)):
                self.run[key].log(value, step=step)

The on_log hook fires whenever the Trainer logs metrics. It iterates the logs dict and forwards every numeric value to GoodSeed with the current global step.

The run is opened as a context manager, and the configuration is logged by assigning the argparse namespace directly, which GoodSeed flattens into individual keys:

# Open the run as a context manager; training happens inside this block.
with goodseed.Run(name=f"imdb-{model_name}", tags=["hf", "imdb", "distilbert"]) as run:
    # args is the argparse.Namespace produced by parse_args().
    run["parameters"] = args
    run["model"] = model_name
    run["dataset"] = "imdb"
    run["device"] = str(training_args.device)

Passing args (an argparse.Namespace) to run["parameters"] automatically flattens it into parameters/epochs, parameters/batch_size, parameters/lr, and parameters/max_samples.

The callback is passed to the Trainer:

# Register GoodSeedCallback so the Trainer's log events are forwarded to the run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    callbacks=[GoodSeedCallback(run)],
)

After training, a final eval metric is stored as a config:

# Evaluate once more after training and store the resulting loss under eval/loss.
eval_result = trainer.evaluate()
run["eval/loss"] = eval_result["eval_loss"]

What gets tracked

Category	Fields
Configs	`parameters/epochs`, `parameters/batch_size`, `parameters/lr`, `parameters/max_samples`, `model`, `dataset`, `device`
Metrics	`loss`, `learning_rate`, `grad_norm`, `eval_loss`, `eval_runtime`, etc. (all Trainer metrics, per step)
Final	`eval/loss`
Auto	CPU/memory usage, stdout, git state (via GoodSeed monitoring)

Customize the run:

python examples/hf_text_classification.py --epochs 5 --lr 5e-5 --batch-size 32 --max-samples 5000