I may have discovered a bug, so I wanted to report it. If I am using it incorrectly, I would appreciate it if you could point that out.
I tried running the MNIST example code from https://github.com/pfnet/pytorch-pfn-extras/blob/master/example/mnist.py in a Jupyter Notebook, simply replacing the command-line arguments with dummy args. However, I encountered the following error: KeyError: "None of [Index(['epoch', 'iteration', 'train/loss', 'lr', 'model/fc2.bias/grad/min',\n 'val/loss', 'val/acc'],\n dtype='object')] are in the [columns]"
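For what it is worth, this message looks like the one pandas raises when a list of columns is selected and none of them exist in the DataFrame, so I suspect the reported keys never make it into whatever table is built from the log. Below is a minimal, standalone reproduction of just the pandas error; the DataFrame contents are made up and say nothing about pytorch-pfn-extras internals:

```python
import pandas as pd

# A made-up DataFrame that contains none of the keys from the traceback.
df = pd.DataFrame({"unrelated_key": [1, 2, 3]})

missing = ["epoch", "iteration", "train/loss", "lr",
           "model/fc2.bias/grad/min", "val/loss", "val/acc"]

# Selecting columns that are all absent raises the same kind of error:
# KeyError: "None of [Index([...], dtype='object')] are in the [columns]"
df[missing]
```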
Here is the code I used in the Jupyter Notebook. The only difference from the original is the treatment of the command-line arguments (see also the note after the code).
import pytorch_pfn_extras as ppe
import pytorch_pfn_extras.training.extensions as extensions
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4 * 4 * 50, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = x.flatten(start_dim=1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        ppe.nn.ensure(x, shape=(None, 10))
        return F.log_softmax(x, dim=1)


def train(manager, args, model, device, train_loader):
    while not manager.stop_trigger:
        model.train()
        for _, (data, target) in enumerate(train_loader):
            with manager.run_iteration(step_optimizers=["main"]):
                data, target = data.to(device), target.to(device)
                output = model(data)
                loss = F.nll_loss(output, target)
                ppe.reporting.report({"train/loss": loss.item()})
                loss.backward()


def test(args, model, device, data, target):
    """The extension loops over the iterator in order to
    drive the evaluator progress bar and reporting
    averages
    """
    model.eval()
    data, target = data.to(device), target.to(device)
    output = model(data)
    # Final result will be average of averages of the same size
    test_loss = F.nll_loss(output, target, reduction="mean").item()
    ppe.reporting.report({"val/loss": test_loss})
    pred = output.argmax(dim=1, keepdim=True)
    correct = pred.eq(target.view_as(pred)).sum().item()
    ppe.reporting.report({"val/acc": correct / len(data)})


def main():
    # Training settings
    class DummyArgs:
        def __init__(self, **kwargs):
            self.__dict__.update(kwargs)

    args = DummyArgs(
        batch_size=64,
        test_batch_size=1000,
        epochs=10,
        lr=0.01,
        momentum=0.5,
        cuda=True,
        seed=1,
        save_model=False,
        snapshot=None,
        slack=None,
    )
    use_cuda = args.cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=True,
            download=True,
            transform=transforms.Compose(
                [
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307,), (0.3081,)),
                ]
            ),
        ),
        batch_size=args.batch_size,
        shuffle=True,
        **kwargs,  # type: ignore[arg-type]
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=False,
            transform=transforms.Compose(
                [
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307,), (0.3081,)),
                ]
            ),
        ),
        batch_size=args.test_batch_size,
        shuffle=True,
        **kwargs,  # type: ignore[arg-type]
    )
    model = Net()
    model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    # manager.extend(...) also works
    my_extensions = [
        extensions.LogReport(),
        # Enables TensorBoard support.
        # Run `tensorboard --logdir runs` to launch the TensorBoard.
        extensions.LogReport(
            writer=ppe.writing.TensorBoardWriter(out_dir="runs"),
            trigger=(1, "iteration"),
        ),
        extensions.ProgressBar(),
        extensions.observe_lr(optimizer=optimizer),
        extensions.ParameterStatistics(model, prefix="model"),
        extensions.VariableStatisticsPlot(model),
        extensions.Evaluator(
            test_loader,
            model,
            eval_func=lambda data, target: test(args, model, device, data, target),
            progress_bar=True,
        ),
        extensions.PlotReport(["train/loss", "val/loss"], "epoch", filename="loss.png"),
        extensions.PrintReport(
            [
                "epoch",
                "iteration",
                "train/loss",
                "lr",
                "model/fc2.bias/grad/min",
                "val/loss",
                "val/acc",
            ]
        ),
        extensions.snapshot(),
    ]
    if args.slack is not None:
        my_extensions.append(
            extensions.Slack(
                channel=args.slack,
                msg="Epoch #{manager.epoch}: val/loss = {val/loss}",
                # Surround the username with <> to mention.
                end_msg="{default}\n<@your_slack_user_name>",
                # Upload any artifacts generated during the training.
                filenames=["result/statistics.png"],
                # You can specify when to upload these files.
                # e.g., only at the final epoch:
                # upload_trigger=(args.epochs, 'epoch'),
            )
        )
    # Custom stop triggers can be added to the manager and
    # their status accessed through `manager.stop_trigger`
    trigger = None
    # trigger = ppe.training.triggers.EarlyStoppingTrigger(
    #     check_trigger=(1, 'epoch'), monitor='val/loss')
    manager = ppe.training.ExtensionsManager(
        model,
        optimizer,
        args.epochs,
        extensions=my_extensions,
        iters_per_epoch=len(train_loader),
        stop_trigger=trigger,
    )
    # Lets load the snapshot
    if args.snapshot is not None:
        state = torch.load(args.snapshot)
        manager.load_state_dict(state)
    train(manager, args, model, device, train_loader)
    # Test function is called from the evaluator extension
    # to get access to the reporter and other facilities
    # test(args, model, device, test_loader)
    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == "__main__":
    main()
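For reference, an equivalent way to stub out the command-line arguments in a notebook without a custom class is argparse.Namespace; this sketch is only meant to illustrate what I mean by "dummy args" and mirrors the values above:

```python
from argparse import Namespace

# Hypothetical drop-in replacement for the DummyArgs class above;
# attribute access (args.batch_size, args.lr, ...) works the same way.
args = Namespace(
    batch_size=64, test_batch_size=1000, epochs=10, lr=0.01, momentum=0.5,
    cuda=True, seed=1, save_model=False, snapshot=None, slack=None,
)
```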