はじめに

このチュートリアルでは、AKAZE と ORB の局所特徴を比較し、それらを用いて動画フレーム間のマッチを求め、物体の動きを追跡する。

アルゴリズムは次のとおりである。

最初のフレームでキーポイントを検出・記述し、物体の境界を手動で設定する
以降の各フレームについて:
1. キーポイントを検出・記述する
2. 総当たりマッチャーを用いてマッチングする
3. RANSACを用いてホモグラフィ変換を推定する
4. すべてのマッチからインライアを抽出する
5. バウンディングボックスにホモグラフィ変換を適用して物体を見つける
6. バウンディングボックスとインライアを描画し、評価指標としてインライア率を計算する

データ

追跡を行うには、動画と最初のフレームでの物体の位置が必要である。

サンプル動画とデータはこちらからダウンロードできる。

コードを実行するには、入力 (カメラID または video_file) を指定する必要がある。その後、マウスでバウンディングボックスを選択し、任意のキーを押すと追跡を開始する。

./planar_tracking blais.mp4

ソースコード

#include <opencv2/features2d.hpp>
#include <opencv2/videoio.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/calib3d.hpp>
#include <opencv2/highgui.hpp> //for imshow
#include <vector>
#include <iostream>
#include <iomanip>
 
#include "stats.h" // Stats structure definition
#include "utils.h" // Drawing and printing functions
 
using namespace std;
using namespace cv;
 
// Tuning parameters shared by both trackers.
// The double constants use double literals: a float literal such as 0.8f is
// first rounded to float and would not compare equal to 0.8.
const double akaze_thresh = 3e-4; // AKAZE detection threshold set to locate about 1000 keypoints
const double ransac_thresh = 2.5; // RANSAC inlier threshold
const double nn_match_ratio = 0.8; // Nearest-neighbour matching ratio
const int bb_min_inliers = 100; // Minimal number of inliers to draw bounding box
const int stats_update_period = 10; // On-screen statistics are updated every 10 frames
 
namespace example {
/**
 * Planar object tracker built from a feature detector/descriptor extractor
 * and a descriptor matcher.
 *
 * setFirstFrame() stores the reference frame together with its features;
 * process() matches every later frame against that reference.
 */
class Tracker
{
public:
    /// Keeps the detector and matcher handles used by all later calls.
    Tracker(Ptr<Feature2D> _detector, Ptr<DescriptorMatcher> _matcher)
        : detector(_detector), matcher(_matcher)
    {
    }

    /// Registers the reference frame and the object outline `bb` inside it.
    void setFirstFrame(const Mat frame, vector<Point2f> bb, string title, Stats& stats);

    /// Matches `frame` against the reference frame and returns the image to display.
    Mat process(const Mat frame, Stats& stats);

    /// Returns the detector handle given to the constructor.
    Ptr<Feature2D> getDetector() { return detector; }

protected:
    Ptr<Feature2D> detector;        // feature detector / descriptor extractor
    Ptr<DescriptorMatcher> matcher; // descriptor matcher
    Mat first_frame;                // annotated copy of the reference frame
    Mat first_desc;                 // descriptors of the reference keypoints
    vector<KeyPoint> first_kp;      // keypoints detected in the reference frame
    vector<Point2f> object_bb;      // object outline in the reference frame
};
 
void Tracker::setFirstFrame(const Mat frame, vector<Point2f> bb, string title, Stats& stats)
{
 cv::Point *ptMask = new cv::Point[bb.size()];
 const Point* ptContain = { &ptMask[0] };
 int iSize = static_cast<int>(bb.size());
 for (size_t i=0; i<bb.size(); i++) {
        ptMask[i].x = static_cast<int>(bb[i].x);
        ptMask[i].y = static_cast<int>(bb[i].y);
    }
    first_frame = frame.clone();
 cv::Mat matMask = cv::Mat::zeros(frame.size(), CV_8UC1);
 cv::fillPoly(matMask, &ptContain, &iSize, 1, cv::Scalar::all(255));
    detector->detectAndCompute(first_frame, matMask, first_kp, first_desc);
    stats.keypoints = (int)first_kp.size();
    drawBoundingBox(first_frame, bb);
 putText(first_frame, title, Point(0, 60), FONT_HERSHEY_PLAIN, 5, Scalar::all(0), 4);
    object_bb = bb;
 delete[] ptMask;
}
 
/**
 * Tracks the object in a new frame by matching it against the reference frame.
 *
 * Steps: detect and describe keypoints, 2-NN match against the reference
 * descriptors with a ratio test, estimate a homography with RANSAC, keep the
 * inlier matches and project the reference outline into the new frame.
 *
 * @param frame  current video frame
 * @param stats  receives keypoint, match and inlier counts, the inlier ratio
 *               and the fps measured over detection, matching and homography
 *               estimation
 * @return the reference frame next to the current frame: with the inlier
 *         matches drawn when a homography was found, otherwise a plain
 *         horizontal concatenation
 */
Mat Tracker::process(const Mat frame, Stats& stats)
{
    TickMeter tm;
    vector<KeyPoint> kp;
    Mat desc;

    tm.start();
    detector->detectAndCompute(frame, noArray(), kp, desc);
    stats.keypoints = static_cast<int>(kp.size());

    vector< vector<DMatch> > matches;
    vector<KeyPoint> matched1, matched2;
    // Nothing can be matched when either image produced no descriptors;
    // skipping the call leaves `matches` empty and takes the "no homography"
    // path below.
    if (!first_desc.empty() && !desc.empty()) {
        matcher->knnMatch(first_desc, desc, matches, 2);
    }
    for (size_t i = 0; i < matches.size(); i++) {
        const vector<DMatch>& candidates = matches[i];
        // The ratio test needs both the best and the second-best candidate;
        // fewer are returned when the frame has fewer than two descriptors.
        if (candidates.size() < 2) {
            continue;
        }
        if (candidates[0].distance < nn_match_ratio * candidates[1].distance) {
            matched1.push_back(first_kp[candidates[0].queryIdx]);
            matched2.push_back(kp[candidates[0].trainIdx]);
        }
    }
    stats.matches = static_cast<int>(matched1.size());

    Mat inlier_mask, homography;
    vector<KeyPoint> inliers1, inliers2;
    vector<DMatch> inlier_matches;
    // A homography needs at least four point correspondences.
    if (matched1.size() >= 4) {
        homography = findHomography(Points(matched1), Points(matched2),
                                    RANSAC, ransac_thresh, inlier_mask);
    }
    tm.stop();
    stats.fps = 1. / tm.getTimeSec();

    if (matched1.size() < 4 || homography.empty()) {
        // Tracking failed for this frame: show both images without matches.
        Mat res;
        hconcat(first_frame, frame, res);
        stats.inliers = 0;
        stats.ratio = 0;
        return res;
    }

    for (size_t i = 0; i < matched1.size(); i++) {
        if (inlier_mask.at<uchar>(static_cast<int>(i))) {
            // Inliers are re-indexed so that entry k of inliers1 pairs with
            // entry k of inliers2, as drawMatches expects.
            const int new_i = static_cast<int>(inliers1.size());
            inliers1.push_back(matched1[i]);
            inliers2.push_back(matched2[i]);
            inlier_matches.push_back(DMatch(new_i, new_i, 0));
        }
    }
    stats.inliers = static_cast<int>(inliers1.size());
    stats.ratio = stats.inliers * 1.0 / stats.matches;

    vector<Point2f> new_bb;
    perspectiveTransform(object_bb, new_bb, homography);
    Mat frame_with_bb = frame.clone();
    // Only draw the projected outline when enough inliers support it.
    if (stats.inliers >= bb_min_inliers) {
        drawBoundingBox(frame_with_bb, new_bb);
    }
    Mat res;
    drawMatches(first_frame, inliers1, frame_with_bb, inliers2,
                inlier_matches, res,
                Scalar(255, 0, 0), Scalar(255, 0, 0));
    return res;
}
}
 
int main(int argc, char **argv)
{
 CommandLineParser parser(argc, argv, "{@input_path |0|input path can be a camera id, like 0,1,2 or a video filename}");
    parser.printMessage();
 string input_path = parser.get<string>(0);
 string video_name = input_path;
 
 VideoCapture video_in;
 
 if ( ( isdigit(input_path[0]) && input_path.size() == 1 ) )
    {
 int camera_no = input_path[0] - '0';
        video_in.open( camera_no );
    }
 else {
        video_in.open(video_name);
    }
 
 if(!video_in.isOpened()) {
        cerr << "Couldn't open " << video_name << endl;
 return 1;
    }
 
    Stats stats, akaze_stats, orb_stats;
 Ptr<AKAZE> akaze = AKAZE::create();
    akaze->setThreshold(akaze_thresh);
 Ptr<ORB> orb = ORB::create();
 Ptr<DescriptorMatcher> matcher = DescriptorMatcher::create("BruteForce-Hamming");
    example::Tracker akaze_tracker(akaze, matcher);
    example::Tracker orb_tracker(orb, matcher);
 
 Mat frame;
 namedWindow(video_name, WINDOW_NORMAL);
    cout << "\nPress any key to stop the video and select a bounding box" << endl;
 
 while ( waitKey(1) < 1 )
    {
        video_in >> frame;
 cv::resizeWindow(video_name, frame.size());
 imshow(video_name, frame);
    }
 
    vector<Point2f> bb;
 cv::Rect uBox = cv::selectROI(video_name, frame);
    bb.push_back(cv::Point2f(static_cast<float>(uBox.x), static_cast<float>(uBox.y)));
    bb.push_back(cv::Point2f(static_cast<float>(uBox.x+uBox.width), static_cast<float>(uBox.y)));
    bb.push_back(cv::Point2f(static_cast<float>(uBox.x+uBox.width), static_cast<float>(uBox.y+uBox.height)));
    bb.push_back(cv::Point2f(static_cast<float>(uBox.x), static_cast<float>(uBox.y+uBox.height)));
 
    akaze_tracker.setFirstFrame(frame, bb, "AKAZE", stats);
    orb_tracker.setFirstFrame(frame, bb, "ORB", stats);
 
    Stats akaze_draw_stats, orb_draw_stats;
 Mat akaze_res, orb_res, res_frame;
 int i = 0;
 for(;;) {
        i++;
 bool update_stats = (i % stats_update_period == 0);
        video_in >> frame;
 // stop the program if no more images
 if(frame.empty()) break;
 
        akaze_res = akaze_tracker.process(frame, stats);
        akaze_stats += stats;
 if(update_stats) {
            akaze_draw_stats = stats;
        }
 
        orb->setMaxFeatures(stats.keypoints);
        orb_res = orb_tracker.process(frame, stats);
        orb_stats += stats;
 if(update_stats) {
            orb_draw_stats = stats;
        }
 
        drawStatistics(akaze_res, akaze_draw_stats);
        drawStatistics(orb_res, orb_draw_stats);
 vconcat(akaze_res, orb_res, res_frame);
 cv::imshow(video_name, res_frame);
 if(waitKey(1)==27) break; //quit on ESC button
    }
    akaze_stats /= i - 1;
    orb_stats /= i - 1;
    printStatistics("AKAZE", akaze_stats);
    printStatistics("ORB", orb_stats);
 return 0;
}

解説

Trackerクラス

このクラスは、与えられた特徴検出器と記述子マッチャーを用いて、上で説明したアルゴリズムを実装する。

最初のフレームの設定
void Tracker::setFirstFrame(const Mat frame, vector<Point2f> bb, string title, Stats& stats)

{

first_frame = frame.clone();

(*detector)(first_frame, noArray(), first_kp, first_desc);

stats.keypoints = (int)first_kp.size();

drawBoundingBox(first_frame, bb);

putText(first_frame, title, Point(0, 60), FONT_HERSHEY_PLAIN, 5, Scalar::all(0), 4);

object_bb = bb;

}

最初のフレームからキーポイントと記述子を計算して保存し、出力用に準備する。

両方の検出器がほぼ同じ数のキーポイントを検出していることを確認するため、検出されたキーポイントの数を保存する必要がある。
フレームの処理
1. キーポイントを検出し、記述子を計算する
  (*detector)(frame, noArray(), kp, desc);
  
  フレーム間のマッチを求めるには、まずキーポイントを検出する必要がある。
  
  このチュートリアルでは、各フレームで約1000個のキーポイントを見つけるように検出器を設定している。
2. 2-nnマッチャーを用いて対応を求める
  // Request the 2 nearest candidates in `desc` for every reference descriptor.
  matcher->knnMatch(first_desc, desc, matches, 2);
  
  for(unsigned i = 0; i < matches.size(); i++) {
  
  // Ratio test: keep the best candidate only when its distance is below
  // nn_match_ratio times the distance of the second-best candidate.
  if(matches[i][0].distance < nn_match_ratio * matches[i][1].distance) {
  
  // queryIdx indexes the reference keypoints, trainIdx the current frame's.
  matched1.push_back(first_kp[matches[i][0].queryIdx]);
  
  matched2.push_back( kp[matches[i][0].trainIdx]);
  
  }
  
  }
  
  最近傍マッチの距離が、2番目に近いマッチの距離の nn_match_ratio 倍より小さければ、それを正しいマッチとみなす。
3. RANSAC を用いてホモグラフィ変換を推定する
  // Estimate the homography from the reference points to the current-frame
  // points with RANSAC; inlier_mask receives one flag per input match.
  homography = findHomography(Points(matched1), Points(matched2),
  
  RANSAC, ransac_thresh, inlier_mask);
  
  マッチが少なくとも4個あれば、ランダムサンプルコンセンサスを用いて画像変換を推定できる。
4. インライアを保存する
  // Copy the matches flagged as inliers into parallel arrays.
  for(unsigned i = 0; i < matched1.size(); i++) {
  
  if(inlier_mask.at<uchar>(i)) {
  
  // Position of this pair in inliers1/inliers2, used as both indices of the DMatch.
  int new_i = static_cast<int>(inliers1.size());
  
  inliers1.push_back(matched1[i]);
  
  inliers2.push_back(matched2[i]);
  
  inlier_matches.push_back(DMatch(new_i, new_i, 0));
  
  }
  
  }
  
  findHomography がインライアを計算するので、選択された点とマッチを保存するだけでよい。
5. 物体のバウンディングボックスを射影する
  // Map the reference outline object_bb through the homography into new_bb.
  perspectiveTransform(object_bb, new_bb, homography);
  
  妥当な数のインライアがあれば、推定した変換を用いて物体の位置を特定できる。

結果

結果として得られた動画はYouTubeで視聴できる。

AKAZE の統計:

Matches      626
Inliers      410
Inlier ratio 0.58
Keypoints    1117

ORB の統計:

Matches      504
Inliers      319
Inlier ratio 0.56
Keypoints    1112


原著者	Fedor Morozov
互換性	OpenCV >= 3.0

目次

はじめに

データ

ソースコード

解説

Trackerクラス

結果