今回はPyTorchのC++ APIでGPUを使った学習と推論を試してみる.
GPUの使い方
torch::cuda::is_available()
// Pick the CUDA device and move the model's parameters/buffers onto it.
// (Check torch::cuda::is_available() beforehand to fall back to CPU.)
torch::DeviceType device_type = torch::kCUDA;  // fixed: stray ';;' removed
torch::Device device(device_type);
Net model;
model.to(device);  // in-place: relocates all registered parameters to `device`
template <typename DataLoader> void train( int32_t epoch, const Options& options, Net& model, torch::Device device, DataLoader& data_loader, torch::optim::Optimizer& optimizer, size_t dataset_size) { model.train(); torch::Tensor loss; for (auto& batch : data_loader) { optimizer.zero_grad(); auto data = batch.data.to(device), targets = batch.target.to(device); auto output = model.forward(data); loss = torch::nll_loss(output, targets); loss.backward(); optimizer.step(); //AT_ASSERT(!std::isnan(loss.template item<float>())); } std::cout << "Train Epoch: " << epoch << "\tLoss: " << loss.template item<float>() << std::endl; }
GPUでの学習
- 以下のネットワークを学習させてみる
struct Net : torch::nn::Module { Net() : conv1(torch::nn::Conv2dOptions(1, 10, /*kernel_size=*/5)), conv2(torch::nn::Conv2dOptions(10, 20, /*kernel_size=*/5)), fc1(320, 50), fc2(50, 10) { register_module("conv1", conv1); register_module("conv2", conv2); register_module("conv2_drop", conv2_drop); register_module("fc1", fc1); register_module("fc2", fc2); } torch::Tensor forward(torch::Tensor x) { x = torch::relu(torch::max_pool2d(conv1->forward(x), 2)); x = torch::relu( torch::max_pool2d(conv2_drop->forward(conv2->forward(x)), 2)); x = x.view({-1, 320}); x = torch::relu(fc1->forward(x)); x = torch::dropout(x, /*p=*/0.5, /*training=*/is_training()); x = fc2->forward(x); return torch::log_softmax(x, /*dim=*/1); } torch::nn::Conv2d conv1; torch::nn::Conv2d conv2; torch::nn::FeatureDropout conv2_drop; torch::nn::Linear fc1; torch::nn::Linear fc2; };
$ time ./mnist CUDA available! Training on GPU start training... Train Epoch: 1 Loss: 0.365448 Train Epoch: 2 Loss: 0.461339 Train Epoch: 3 Loss: 0.233072 Train Epoch: 4 Loss: 0.0650344 Train Epoch: 5 Loss: 0.0621544 Train Epoch: 6 Loss: 0.143679 Train Epoch: 7 Loss: 0.143441 Train Epoch: 8 Loss: 0.0391192 Train Epoch: 9 Loss: 0.0670136 Train Epoch: 10 Loss: 0.118915 real 1m13.645s user 1m1.240s sys 0m12.144s
ちなみにPython APIでは50sほどで学習できたので,GPUが使える環境での学習はPython APIの方が速いらしい
GPUでの推論
C++: real 0m6.157s user 0m4.934s sys 0m1.199s
Python: real 0m7.766s user 0m7.106s sys 0m1.748s
API | time (s) |
---|---|
Python | 7.766s |
C++ | 6.157s |
- こちらはC++の方が速い
Multi Layer Perceptronで実験
- CNNだけではなくMLPでも実験してみる
- 使うネットワークは以下のもの
struct Net : torch::nn::Module { Net() { // Construct and register two Linear submodules. fc1 = register_module("fc1", torch::nn::Linear(784, 64)); fc2 = register_module("fc2", torch::nn::Linear(64, 32)); fc3 = register_module("fc3", torch::nn::Linear(32, 10)); } // Implement the Net's algorithm. torch::Tensor forward(torch::Tensor x) { // Use one of many tensor manipulation functions. x = torch::relu(fc1->forward(x.reshape({x.size(0), 784}))); x = torch::dropout(x, /*p=*/0.5, /*train=*/is_training()); x = torch::relu(fc2->forward(x)); x = torch::log_softmax(fc3->forward(x), /*dim=*/1); return x; } // Use one of many "standard library" modules. torch::nn::Linear fc1{nullptr}, fc2{nullptr}, fc3{nullptr}; };
$ time ./simple_mnist Epoch: 1 | Loss: 2.25581 Epoch: 2 | Loss: 2.08061 Epoch: 3 | Loss: 1.72753 Epoch: 4 | Loss: 1.44198 Epoch: 5 | Loss: 1.16616 Epoch: 6 | Loss: 0.870318 Epoch: 7 | Loss: 0.827086 Epoch: 8 | Loss: 0.685862 Epoch: 9 | Loss: 0.626552 Epoch: 10 | Loss: 0.76815 real 0m4.942s user 0m4.130s sys 0m0.744s
API,device | time (min/sec) |
---|---|
Python CPU | 1m18.185s |
Python GPU | 0m41.901s |
C++ CPU | 0m11.544s |
C++ GPU | 0m4.942s |
- C++だと畳み込みが遅いけど全結合はすごい速い