はじめに

このチュートリアルでは、ONNX model zoo で公開されている学習済みの GoogLeNet ネットワークを使い、opencv_dnn モジュールで画像分類を行う方法を学ぶ。

この例の結果を次の画像で示す。

ソースコード

サンプルアプリケーションの一部を抜粋して使用する。これはこちらからダウンロードできる。

#include <fstream>
#include <sstream>
#include <iostream>
 
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/core/utils/logger.hpp>
 
#include "common.hpp"
 
using namespace cv;
using namespace std;
using namespace dnn;
 
// Usage text: main() prints it when no model alias is given or help is
// requested, and also registers it with the parser via parser.about().
const string about =
 "Use this script to run a classification model on a camera stream, video, image or image list (i.e. .xml or .yaml containing image lists)\n\n"
 "Firstly, download required models using `download_models.py` (if not already done). Set environment variable OPENCV_DOWNLOAD_CACHE_DIR to specify where models should be downloaded. Also, point OPENCV_SAMPLES_DATA_PATH to opencv/samples/data.\n"
 "To run:\n"
 "\t ./example_dnn_classification model_name --input=path/to/your/input/image/or/video (don't give --input flag if want to use device camera)\n"
 "Sample command:\n"
 "\t ./example_dnn_classification resnet --input=$OPENCV_SAMPLES_DATA_PATH/baboon.jpg\n"
 "\t ./example_dnn_classification squeezenet\n"
 "Model path can also be specified using --model argument. "
 "Use imagelist_creator to create the xml or yaml list\n";
 
// CommandLineParser key definitions for the sample's own options.
// Each entry has the form "{ name [alias] | default | description }".
const string param_keys =
 "{ help  h         |                   | Print help message. }"
 "{ @alias          |                   | An alias name of model to extract preprocessing parameters from models.yml file. }"
 "{ zoo             | ../dnn/models.yml | An optional path to file with preprocessing parameters }"
 "{ input i         |                   | Path to input image or video file. Skip this argument to capture frames from a camera.}"
 "{ imglist         |                   | Pass this flag if image list (i.e. .xml or .yaml) file is passed}"
 "{ crop            |       false       | Preprocess input image by center cropping.}"
 // The "labels" key below is disabled here, yet main() still reads "labels"
 // from the parser. NOTE(review): presumably genPreprocArguments() supplies
 // that key - confirm against common.hpp.
 //"{ labels          |                   | Path to the text file with labels for detected objects.}"
 "{ model           |                   | Path to the model file.}";
 
// Key definition for --backend. The value is passed to getBackendID() in main().
const string backend_keys = format(
 "{ backend          | default | Choose one of computation backends: "
 "default: automatically (by default), "
 "openvino: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
 "opencv: OpenCV implementation, "
 "vkcom: VKCOM, "
 "cuda: CUDA, "
 "webnn: WebNN }");
 
// Key definition for --target. The value is passed to getTargetID() in main().
const string target_keys = format(
 "{ target           | cpu | Choose one of target computation devices: "
 "cpu: CPU target (by default), "
 "opencl: OpenCL, "
 "opencl_fp16: OpenCL fp16 (half-float precision), "
 "vpu: VPU, "
 "vulkan: Vulkan, "
 "cuda: CUDA, "
 "cuda_fp16: CUDA fp16 (half-float preprocess) }");
 
// Full key set handed to CommandLineParser. Deliberately non-const: main()
// appends the model-specific keys returned by genPreprocArguments() to it.
string keys = param_keys + backend_keys + target_keys;
 
// Class names, one per line of the labels file; filled in by main() and
// indexed by class id when the predictions are drawn.
vector<string> classes;
static bool readStringList( const string& filename, vector<string>& l )
{
    l.resize(0);
 FileStorage fs(filename, FileStorage::READ);
 if( !fs.isOpened() )
 return false;
 size_t dir_pos = filename.rfind('/');
 if (dir_pos == string::npos)
        dir_pos = filename.rfind('\\');
 FileNode n = fs.getFirstTopLevelNode();
 if( n.type() != FileNode::SEQ )
 return false;
 FileNodeIterator it = n.begin(), it_end = n.end();
 for( ; it != it_end; ++it )
    {
 string fname = (string)*it;
 if (dir_pos != string::npos)
        {
 string fpath = samples::findFile(filename.substr(0, dir_pos + 1) + fname, false);
 if (fpath.empty())
            {
                fpath = samples::findFile(fname);
            }
            fname = fpath;
        }
 else
        {
            fname = samples::findFile(fname);
        }
        l.push_back(fname);
    }
 return true;
}
 
/**
 * Entry point of the classification sample.
 *
 * Parses the command line (model alias plus the preprocessing keys generated
 * for that alias), loads the class labels and the ONNX model, then classifies
 * every frame coming from an image list, a video/image file or the default
 * camera, drawing the inference time and the top predictions on the frame.
 *
 * Keys while running: space pauses until the next key press, 'q' or ESC quits.
 *
 * @return 0 on normal termination (including quit by key), -1 when the model
 *         alias is missing, help is requested or the input cannot be opened,
 *         1 when the labels file cannot be opened or argument parsing failed.
 */
int main(int argc, char** argv)
{
    utils::logging::setLogLevel(utils::logging::LOG_LEVEL_INFO);

    CommandLineParser parser(argc, argv, keys);

    if (!parser.has("@alias") || parser.has("help"))
    {
        cout << about << endl;
        parser.printMessage();
        return -1;
    }
    const string modelName = parser.get<String>("@alias");
    const string zooFile = findFile(parser.get<String>("zoo"));

    // Parse again now that the model-specific keys are known.
    keys += genPreprocArguments(modelName, zooFile);
    parser = CommandLineParser(argc, argv, keys);
    parser.about(about);
    if (argc == 1 || parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }
    String sha1 = parser.get<String>("sha1");
    float scale = parser.get<float>("scale");
    Scalar mean = parser.get<Scalar>("mean");
    // Local named `std`; qualified names such as std::min below still
    // resolve to the namespace.
    Scalar std = parser.get<Scalar>("std");
    bool swapRB = parser.get<bool>("rgb");
    bool crop = parser.get<bool>("crop");
    int inpWidth = parser.get<int>("width");
    int inpHeight = parser.get<int>("height");
    String model = findModel(parser.get<String>("model"), sha1);
    String backend = parser.get<String>("backend");
    String target = parser.get<String>("target");
    bool isImgList = parser.has("imglist");

    // Open file with labels: one class name per line.
    const string labels_filename = parser.get<String>("labels");
    const string file = findFile(labels_filename);
    ifstream ifs(file.c_str());
    if (!ifs.is_open())
    {
        // Return instead of exit() so locals are destroyed normally, and
        // terminate the message with a newline.
        cout << "File " << file << " not found" << endl;
        return 1;
    }
    string line;
    while (getline(ifs, line))
    {
        classes.push_back(line);
    }
    if (!parser.check())
    {
        parser.printErrors();
        return 1;
    }
    CV_Assert(!model.empty());

    // Any explicit backend/target choice selects the classic engine.
    EngineType engine = ENGINE_AUTO;
    if (backend != "default" || target != "cpu")
    {
        engine = ENGINE_CLASSIC;
    }
    Net net = readNetFromONNX(model, engine);
    net.setPreferableBackend(getBackendID(backend));
    net.setPreferableTarget(getTargetID(target));
    net.setProfilingMode(DNN_PROFILE_SUMMARY);

    // Create a window
    static const std::string kWinName = "Deep learning image classification in OpenCV";
    namedWindow(kWinName, WINDOW_NORMAL);

    // Create FontFace for putText
    FontFace sans("sans");

    VideoCapture cap;
    vector<string> imageList;
    size_t currentImageIndex = 0;

    if (parser.has("input"))
    {
        string input = findFile(parser.get<String>("input"));

        if (isImgList)
        {
            bool check = readStringList(samples::findFile(input), imageList);
            if (imageList.empty() || !check)
            {
                cout << "Error: No images found or the provided file is not a valid .yaml or .xml file." << endl;
                return -1;
            }
        }
        else
        {
            // Input is not an image list, try to open as video or image
            cap.open(input);
            if (!cap.isOpened())
            {
                cout << "Failed to open the input." << endl;
                return -1;
            }
        }
    }
    else
    {
        cap.open(0); // Open default camera
        if (!cap.isOpened())
        {
            // Same handling as for a file input that cannot be opened.
            cout << "Failed to open the default camera." << endl;
            return -1;
        }
    }

    // The untimed forward pass is run only for the first frame instead of
    // once per frame. NOTE(review): presumably its purpose is to keep
    // one-time setup cost out of the reported inference time.
    bool warmedUp = false;

    Mat frame, blob;
    for (;;)
    {
        if (!imageList.empty())
        {
            // Handling list of images
            if (currentImageIndex >= imageList.size())
            {
                // All images processed; the waitKey() after the loop keeps
                // the last result on screen, so no extra wait is done here.
                break;
            }
            frame = imread(imageList[currentImageIndex++]);
            if (frame.empty())
            {
                cout << "Cannot open file" << endl;
                continue;
            }
        }
        else
        {
            // Handling video or single image
            cap >> frame;
        }
        if (frame.empty())
        {
            break;
        }

        blobFromImage(frame, blob, scale, Size(inpWidth, inpHeight), mean, swapRB, crop);
        // Check std values.
        if (std.val[0] != 0.0 && std.val[1] != 0.0 && std.val[2] != 0.0)
        {
            // Divide blob by std.
            // NOTE(review): blob is a 4-D single-channel Mat, so this
            // presumably divides every element by std[0] only - confirm
            // whether per-channel division is intended.
            divide(blob, std, blob);
        }
        net.setInput(blob);

        if (!warmedUp)
        {
            net.forward();
            warmedUp = true;
        }

        TickMeter timeRecorder;
        timeRecorder.start();
        Mat prob = net.forward();
        timeRecorder.stop();
        net.printPerfProfile();

        // Sort (negated probability, class id) pairs in ascending order so
        // the most probable classes come first.
        int N = (int)prob.total(), K = std::min(5, N);
        std::vector<std::pair<float, int> > prob_vec;
        prob_vec.reserve(N);
        for (int i = 0; i < N; i++)
        {
            prob_vec.push_back(std::make_pair(-prob.at<float>(i), i));
        }
        std::sort(prob_vec.begin(), prob_vec.end());

        const double t1 = timeRecorder.getTimeMilli();
        string label = format("Inference time: %.1f ms", t1);
        // Darken the top-left region so the overlay text stays readable.
        Mat subframe = frame(Rect(0, 0, std::min(1000, frame.cols), std::min(300, frame.rows)));
        subframe *= 0.3f;
        putText(frame, label, Point(20, 50), Scalar(0, 255, 0), sans, 25, 800);

        // Print predicted class.
        for (int i = 0; i < K; i++)
        {
            const int classId = prob_vec[i].second;
            const float confidence = -prob_vec[i].first;
            // Fall back to the numeric id when the labels file has no entry
            // for this class (empty or shorter than the network output).
            const bool hasName = classId >= 0 && classId < static_cast<int>(classes.size());
            const string className = hasName ? classes[classId] : format("Class #%d", classId);
            label = format("%d. %s: %.2f", i + 1, className.c_str(), confidence);
            putText(frame, label, Point(20, 110 + i*35), Scalar(0, 255, 0), sans, 25, 500);
        }
        imshow(kWinName, frame);
        int key = waitKey(isImgList ? 1000 : 100);
        if (key == ' ')
            key = waitKey();
        if (key == 'q' || key == 27) // Check if 'q' or 'ESC' is pressed
            return 0;
    }
    waitKey();
    return 0;
}

解説

まず、GoogLeNetのモデルファイルをダウンロードする:
python download_models.py googlenet

また、ILSVRC2012のクラス名が記載されたファイルも必要である: classification_classes_ILSVRC2012.txt。

これらのファイルをこのプログラム例の作業ディレクトリに置く。
.onnxファイルへのパスを使ってネットワークを読み込み、初期化する
EngineType engine = ENGINE_AUTO;

if (backend != "default" || target != "cpu"){

engine = ENGINE_CLASSIC;

}

Net net = readNetFromONNX(model, engine);

net.setPreferableBackend(getBackendID(backend));

net.setPreferableTarget(getTargetID(target));

net.setProfilingMode(DNN_PROFILE_SUMMARY);
入力画像を読み込み、GoogLeNet が受け付ける blob に変換する
VideoCapture cap;

vector<string> imageList;

size_t currentImageIndex = 0;

if (parser.has("input")) {

string input = findFile(parser.get<String>("input"));

if (isImgList) {

bool check = readStringList(samples::findFile(input), imageList);

if (imageList.empty() || !check) {

cout << "Error: No images found or the provided file is not a valid .yaml or .xml file." << endl;

return -1;

}

} else {

// Input is not a directory, try to open as video or image

cap.open(input);

if (!cap.isOpened()) {

cout << "Failed to open the input." << endl;

return -1;

}

}

} else {

cap.open(0); // Open default camera

}

cv::VideoCapture は画像と動画の両方を読み込める。

blobFromImage(frame, blob, scale, Size(inpWidth, inpHeight), mean, swapRB, crop);

// Check std values.

if (std.val[0] != 0.0 && std.val[1] != 0.0 && std.val[2] != 0.0)

{

// Divide blob by std.

divide(blob, std, blob);

}

リサイズや青・緑・赤の各チャンネルに対する平均値の減算といった必要な前処理を適用したうえで、cv::dnn::blobFromImage関数を使って、画像を1x3x224x224の形状を持つ4次元のblob(いわゆるバッチ)に変換する。
blob をネットワークに渡す
net.setInput(blob);
順伝播を実行する
timeRecorder.start();

prob = net.forward();

timeRecorder.stop();

net.printPerfProfile();

順伝播の間に各ネットワーク層の出力が計算されるが、この例では最後の層の出力だけが必要である。
最も確からしいクラスを決定する
int N = (int)prob.total(), K = std::min(5, N);

std::vector<std::pair<float, int> > prob_vec;

for (int i = 0; i < N; i++) {

prob_vec.push_back(std::make_pair(-prob.at<float>(i), i));

}

std::sort(prob_vec.begin(), prob_vec.end());

ネットワークの出力である prob blob には、ILSVRC2012 の 1000 個の画像クラスそれぞれの確率が格納されている。ここでは、確率の符号を反転した値とクラスのインデックスの組を昇順にソートすることで、確率の高い順にクラスを並べ、上位 K 個(最大 5 個)を取り出す。先頭の要素のインデックスが、最も確からしい画像クラスに対応する。
コマンドラインから例を実行する
./example_dnn_classification googlenet

今回の画像では、99%を超える確信度でspace shuttleクラスという予測が得られる。


原著者	Vitaliy Lyudvichenko
互換性	OpenCV >= 4.5.4

目次

はじめに

ソースコード

解説