cpp/ja/hierarchical__clustering__index_8h_source.html

/***********************************************************************


* Software License Agreement (BSD License)


*


* Copyright 2008-2011  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.


* Copyright 2008-2011  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.


*


* THE BSD LICENSE


*


* Redistribution and use in source and binary forms, with or without


* modification, are permitted provided that the following conditions


* are met:


*


* 1. Redistributions of source code must retain the above copyright


*    notice, this list of conditions and the following disclaimer.


* 2. Redistributions in binary form must reproduce the above copyright


*    notice, this list of conditions and the following disclaimer in the


*    documentation and/or other materials provided with the distribution.


*


* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR


* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES


* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.


* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,


* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT


* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,


* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY


* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT


* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF


* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


*************************************************************************/


#ifndef OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_


#define OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_


#include <algorithm>


#include <map>


#include <limits>


#include <cmath>


#include "general.h"


#include "nn_index.h"


#include "dist.h"


#include "matrix.h"


#include "result_set.h"


#include "heap.h"


#include "allocator.h"


#include "random.h"


#include "saving.h"


namespace
cvflann


{


struct
HierarchicalClusteringIndexParams :
public
IndexParams


{


HierarchicalClusteringIndexParams(int
branching = 32,


flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM,


int
trees = 4,
int
leaf_size = 100)


{


(*this)["algorithm"] = FLANN_INDEX_HIERARCHICAL;


// The branching factor used in the hierarchical clustering


(*this)["branching"] = branching;


// Algorithm used for picking the initial cluster centers


(*this)["centers_init"] = centers_init;


// number of parallel trees to build


(*this)["trees"] = trees;


// maximum leaf size


(*this)["leaf_size"] = leaf_size;


}


};


template
<typename
Distance>


class
HierarchicalClusteringIndex :
public
NNIndex<Distance>


{


public:


// Element type of a dataset row, as declared by the Distance functor.
typedef
typename
Distance::ElementType ElementType;


// Result type produced by the Distance functor.
typedef
typename
Distance::ResultType DistanceType;


private:


// Pointer-to-member signature shared by the center-selection routines:
// (k, dsindices, indices_length, centers, centers_length).
typedef
void (HierarchicalClusteringIndex::* centersAlgFunction)(int,
int*, int,
int*,
int&);


// Center-selection routine; assigned in the constructor from "centers_init".
centersAlgFunction chooseCenters;


void
chooseCentersRandom(int
k,
int* dsindices,
int
indices_length,
int* centers,
int& centers_length)


{


UniqueRandom r(indices_length);


int
index;


for
(index=0; index<k; ++index) {


bool
duplicate =
true;


int
rnd;


while
(duplicate) {


duplicate =
false;


rnd = r.next();


if
(rnd<0) {


centers_length = index;


return;


}


centers[index] = dsindices[rnd];


for
(int
j=0; j<index; ++j) {


DistanceType sq = distance(dataset[centers[index]], dataset[centers[j]], dataset.cols);


if
(sq<1e-16) {


duplicate =
true;


}


}


}


}


centers_length = index;


}


void
chooseCentersGonzales(int
k,
int* dsindices,
int
indices_length,
int* centers,
int& centers_length)


{


int
n = indices_length;


int
rnd = rand_int(n);


CV_DbgAssert(rnd >=0 && rnd < n);


centers[0] = dsindices[rnd];


int
index;


for
(index=1; index<k; ++index) {


int
best_index = -1;


DistanceType best_val = 0;


for
(int
j=0; j<n; ++j) {


DistanceType dist = distance(dataset[centers[0]],dataset[dsindices[j]],dataset.cols);


for
(int
i=1; i<index; ++i) {


DistanceType tmp_dist = distance(dataset[centers[i]],dataset[dsindices[j]],dataset.cols);


if
(tmp_dist<dist) {


dist = tmp_dist;


}


}


if
(dist>best_val) {


best_val = dist;


best_index = j;


}


}


if
(best_index!=-1) {


centers[index] = dsindices[best_index];


}


else
{


break;


}


}


centers_length = index;


}


void
chooseCentersKMeanspp(int
k,
int* dsindices,
int
indices_length,
int* centers,
int& centers_length)


{


int
n = indices_length;


double
currentPot = 0;


DistanceType* closestDistSq =
new
DistanceType[n];


// Choose one random center and set the closestDistSq values


int
index = rand_int(n);


CV_DbgAssert(index >=0 && index < n);


centers[0] = dsindices[index];


// Computing distance^2 will have the advantage of even higher probability further to pick new centers


// far from previous centers (and this complies to "k-means++: the advantages of careful seeding" article)


for
(int
i = 0; i < n; i++) {


closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);


closestDistSq[i] = ensureSquareDistance<Distance>( closestDistSq[i] );


currentPot += closestDistSq[i];


}


const
int
numLocalTries = 1;


// Choose each center


int
centerCount;


for
(centerCount = 1; centerCount < k; centerCount++) {


// Repeat several trials


double
bestNewPot = -1;


int
bestNewIndex = 0;


for
(int
localTrial = 0; localTrial < numLocalTries; localTrial++) {


// Choose our center - have to be slightly careful to return a valid answer even accounting


// for possible rounding errors


double
randVal = rand_double(currentPot);


for
(index = 0; index < n-1; index++) {


if
(randVal <= closestDistSq[index])
break;


else
randVal -= closestDistSq[index];


}


// Compute the new potential


double
newPot = 0;


for
(int
i = 0; i < n; i++) {


DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);


newPot +=
std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] );


}


// Store the best result


if
((bestNewPot < 0)||(newPot < bestNewPot)) {


bestNewPot = newPot;


bestNewIndex = index;


}


}


// Add the appropriate center


centers[centerCount] = dsindices[bestNewIndex];


currentPot = bestNewPot;


for
(int
i = 0; i < n; i++) {


DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols);


closestDistSq[i] =
std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] );


}


}


centers_length = centerCount;


delete[] closestDistSq;


}


void
GroupWiseCenterChooser(int
k,
int* dsindices,
int
indices_length,
int* centers,
int& centers_length)


{


const
float
kSpeedUpFactor = 1.3f;


int
n = indices_length;


DistanceType* closestDistSq =
new
DistanceType[n];


// Choose one random center and set the closestDistSq values


int
index = rand_int(n);


CV_DbgAssert(index >=0 && index < n);


centers[0] = dsindices[index];


for
(int
i = 0; i < n; i++) {


closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);


}


// Choose each center


int
centerCount;


for
(centerCount = 1; centerCount < k; centerCount++) {


// Repeat several trials


double
bestNewPot = -1;


int
bestNewIndex = 0;


DistanceType furthest = 0;


for
(index = 0; index < n; index++) {


// We will test only the potential of the points further than current candidate


if( closestDistSq[index] > kSpeedUpFactor * (float)furthest ) {


// Compute the new potential


double
newPot = 0;


for
(int
i = 0; i < n; i++) {


newPot +=
std::min( distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols)


, closestDistSq[i] );


}


// Store the best result


if
((bestNewPot < 0)||(newPot <= bestNewPot)) {


bestNewPot = newPot;


bestNewIndex = index;


furthest = closestDistSq[index];


}


}


}


// Add the appropriate center


centers[centerCount] = dsindices[bestNewIndex];


for
(int
i = 0; i < n; i++) {


closestDistSq[i] =
std::min( distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols)


, closestDistSq[i] );


}


}


centers_length = centerCount;


delete[] closestDistSq;


}


public:


HierarchicalClusteringIndex(const
Matrix<ElementType>& inputData,
const
IndexParams& index_params = HierarchicalClusteringIndexParams(),


Distance d = Distance())


: dataset(inputData), params(index_params), root(NULL), indices(NULL), distance(d)


{


memoryCounter = 0;


size_ = dataset.rows;


veclen_ = dataset.cols;


branching_ = get_param(params,"branching",32);


centers_init_ = get_param(params,"centers_init", FLANN_CENTERS_RANDOM);


trees_ = get_param(params,"trees",4);


leaf_size_ = get_param(params,"leaf_size",100);


if
(centers_init_==FLANN_CENTERS_RANDOM) {


chooseCenters = &HierarchicalClusteringIndex::chooseCentersRandom;


}


else
if
(centers_init_==FLANN_CENTERS_GONZALES) {


chooseCenters = &HierarchicalClusteringIndex::chooseCentersGonzales;


}


else
if
(centers_init_==FLANN_CENTERS_KMEANSPP) {


chooseCenters = &HierarchicalClusteringIndex::chooseCentersKMeanspp;


}


else
if
(centers_init_==FLANN_CENTERS_GROUPWISE) {


chooseCenters = &HierarchicalClusteringIndex::GroupWiseCenterChooser;


}


else
{


FLANN_THROW(cv::Error::StsError,
"Unknown algorithm for choosing initial centers.");


}


root =
new
NodePtr[trees_];


indices =
new
int*[trees_];


for
(int
i=0; i<trees_; ++i) {


root[i] = NULL;


indices[i] = NULL;


}


}


// Copy construction and copy assignment are only declared here; no
// definitions are visible in this file.
// NOTE(review): presumably left undefined so the owned root/indices arrays
// cannot be copied -- confirm.
HierarchicalClusteringIndex(const
HierarchicalClusteringIndex&);


HierarchicalClusteringIndex& operator=(const
HierarchicalClusteringIndex&);


/**
 * Releases the per-tree index arrays and the root/index tables.
 * Tree nodes themselves are allocated from `pool` and are not freed here.
 */
virtual ~HierarchicalClusteringIndex()
{
    // free_indices() is a no-op when `indices` is NULL, and delete[] accepts
    // NULL, so no explicit guards are required.
    free_indices();
    delete[] indices;
    delete[] root;
}


/** Number of dataset rows, as captured from dataset.rows in the constructor. */
size_t size() const CV_OVERRIDE
{
    return this->size_;
}


/** Length of a feature vector, as captured from dataset.cols in the constructor. */
size_t veclen() const CV_OVERRIDE
{
    return this->veclen_;
}


/**
 * Memory accounted to the index: the pool's used and wasted byte counters
 * plus memoryCounter.
 */
int usedMemory() const CV_OVERRIDE
{
    return this->pool.usedMemory + this->pool.wastedMemory + this->memoryCounter;
}


/**
 * Builds all trees of the index.
 *
 * For every tree a fresh identity permutation of the dataset row ids is
 * created and recursively clustered starting from a newly allocated root.
 * Raises via FLANN_THROW when the branching factor is below 2.
 */
void buildIndex() CV_OVERRIDE
{
    if (branching_<2) {
        FLANN_THROW(cv::Error::StsError, "Branching factor must be at least 2");
    }

    // Drop index arrays left over from a previous build or load.
    free_indices();

    int tree = 0;
    while (tree < trees_) {
        indices[tree] = new int[size_];
        int* ids = indices[tree];
        for (size_t pos = 0; pos < size_; ++pos) {
            ids[pos] = (int)pos;
        }

        root[tree] = pool.allocate<Node>();
        computeClustering(root[tree], ids, (int)size_, branching_, 0);
        ++tree;
    }
}


/** Identifies this index as the hierarchical clustering algorithm. */
flann_algorithm_t getType() const CV_OVERRIDE
{
    const flann_algorithm_t kind = FLANN_INDEX_HIERARCHICAL;
    return kind;
}


/**
 * Writes the index to `stream`: first the scalar settings (branching, trees,
 * centers_init, leaf_size, memoryCounter), then for each tree its index
 * array followed by its node hierarchy.
 */
void saveIndex(FILE* stream) CV_OVERRIDE
{
    save_value(stream, branching_);
    save_value(stream, trees_);
    save_value(stream, centers_init_);
    save_value(stream, leaf_size_);
    save_value(stream, memoryCounter);

    int tree = 0;
    while (tree < trees_) {
        int* tree_indices = indices[tree];
        save_value(stream, *tree_indices, size_);
        save_tree(stream, root[tree], tree);
        ++tree;
    }
}


/**
 * Replaces the current index with one read from `stream`, in the layout
 * written by saveIndex, and refreshes the stored parameters.
 *
 * The old tables are released and their pointers reset before anything is
 * read, and the new tables are NULL-initialised before being filled. If a
 * read raises part-way through, the destructor therefore only sees NULL or
 * fully owned pointers (no double delete, no delete of garbage pointers).
 */
void loadIndex(FILE* stream) CV_OVERRIDE
{
    // Release the previous state while trees_ still holds the old tree count.
    free_indices();
    delete[] indices;
    indices = NULL;
    delete[] root;
    root = NULL;

    load_value(stream, branching_);
    load_value(stream, trees_);
    load_value(stream, centers_init_);
    load_value(stream, leaf_size_);
    load_value(stream, memoryCounter);

    indices = new int*[trees_];
    for (int i=0; i<trees_; ++i) {
        indices[i] = NULL;
    }
    root = new NodePtr[trees_];
    for (int i=0; i<trees_; ++i) {
        root[i] = NULL;
    }

    for (int i=0; i<trees_; ++i) {
        indices[i] = new int[size_];
        load_value(stream, *indices[i], size_);
        load_tree(stream, root[i], i);
    }

    params["algorithm"] = getType();
    params["branching"] = branching_;
    params["trees"] = trees_;
    params["centers_init"] = centers_init_;
    params["leaf_size"] = leaf_size_;
}


/**
 * Finds the nearest neighbours of `vec` and stores them in `result`.
 *
 * Each tree is first descended once; the unexplored branches collected in
 * the priority queue are then visited best-first until at least "checks"
 * points have been examined and the result set is full, or the queue is
 * empty.
 *
 * The priority queue is an automatic object, so it is released even when
 * the search raises.
 *
 * @param result        result set receiving the neighbours
 * @param vec           query vector
 * @param searchParams  reads "checks" (default 32) and "explore_all_trees"
 *                      (default false)
 */
void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
{
    const int maxChecks = get_param(searchParams,"checks",32);
    const bool explore_all_trees = get_param(searchParams,"explore_all_trees",false);

    // Priority queue storing intermediate branches in the best-bin-first search
    Heap<BranchSt> heap((int)size_);

    // Marks dataset rows already scored, so a row shared by trees is counted once.
    std::vector<bool> checked(size_,false);
    int checks = 0;
    for (int i=0; i<trees_; ++i) {
        findNN(root[i], result, vec, checks, maxChecks, &heap, checked, explore_all_trees);
        if (!explore_all_trees && (checks >= maxChecks) && result.full())
            break;
    }

    BranchSt branch;
    while (heap.popMin(branch) && (checks<maxChecks || !result.full())) {
        NodePtr node = branch.node;
        findNN(node, result, vec, checks, maxChecks, &heap, checked, false);
    }

    CV_Assert(result.full());
}


/** Returns a copy of the parameters currently stored in the index. */
IndexParams getParameters() const CV_OVERRIDE
{
    return this->params;
}


private:


// One node of a hierarchical clustering tree.
struct
Node


{


// Dataset row id of the cluster center this node was created for.
int
pivot;


// Number of points in the cluster.
int
size;


// Child nodes; NULL for a leaf.
Node** childs;


// For a leaf: pointer into the owning tree's index array where this
// node's points start.
int* indices;


// Depth of the node, with the root at level 0.
int
level;


};


typedef
Node* NodePtr;


// Heap entry pairing a node with a distance.
typedef
BranchStruct<NodePtr, DistanceType> BranchSt;


/**
 * Recursively writes `node` and its subtree to `stream`.
 *
 * The node is written as-is. For a leaf, the offset of its index range
 * within tree `num`'s index array follows; otherwise the branching_
 * children follow in order.
 */
void save_tree(FILE* stream, NodePtr node, int num)
{
    save_value(stream, *node);

    if (node->childs != NULL) {
        int child = 0;
        while (child < branching_) {
            save_tree(stream, node->childs[child], num);
            ++child;
        }
        return;
    }

    // Leaf: store a position relative to the tree's index array, since the
    // raw pointer is meaningless after reloading.
    int indices_offset = (int)(node->indices - indices[num]);
    save_value(stream, indices_offset);
}


/**
 * Recursively reads a subtree from `stream` into pool-allocated nodes,
 * mirroring save_tree.
 *
 * A non-NULL `childs` value in the stored node marks an inner node; its
 * child table is re-allocated and filled. For a leaf the stored offset is
 * turned back into a pointer into tree `num`'s index array.
 */
void load_tree(FILE* stream, NodePtr& node, int num)
{
    node = pool.allocate<Node>();
    load_value(stream, *node);

    if (node->childs != NULL) {
        node->childs = pool.allocate<NodePtr>(branching_);
        int child = 0;
        while (child < branching_) {
            load_tree(stream, node->childs[child], num);
            ++child;
        }
        return;
    }

    int indices_offset;
    load_value(stream, indices_offset);
    node->indices = indices[num] + indices_offset;
}


void
free_indices()


{


if
(indices!=NULL) {


for(int
i=0; i<trees_; ++i) {


if
(indices[i]!=NULL) {


delete[] indices[i];


indices[i] = NULL;


}


}


}


}


void
computeLabels(int* dsindices,
int
indices_length,
int* centers,
int
centers_length,
int* labels, DistanceType& cost)


{


cost = 0;


for
(int
i=0; i<indices_length; ++i) {


ElementType* point = dataset[dsindices[i]];


DistanceType dist = distance(point, dataset[centers[0]], veclen_);


labels[i] = 0;


for
(int
j=1; j<centers_length; ++j) {


DistanceType new_dist = distance(point, dataset[centers[j]], veclen_);


if
(dist>new_dist) {


labels[i] = j;


dist = new_dist;


}


}


cost += dist;


}


}


/**
 * Recursively builds the clustering subtree rooted at `node`.
 *
 * The node becomes a leaf (sorted index range, no children) when it holds
 * fewer than leaf_size_ points or when fewer than `branching` centers could
 * be chosen. Otherwise the points are labelled by nearest center, the index
 * range is reordered in place so each cluster is contiguous, and one child
 * per center is built on its sub-range.
 *
 * @param node            node to fill in
 * @param dsindices       dataset row ids belonging to the node (reordered in place)
 * @param indices_length  number of entries in dsindices
 * @param branching       number of children per inner node
 * @param level           depth of `node`
 */
void computeClustering(NodePtr node, int* dsindices, int indices_length, int branching, int level)
{
    node->size = indices_length;
    node->level = level;

    // Centers are only chosen when the node is large enough to be split.
    const bool too_small = indices_length < leaf_size_;
    std::vector<int> centers;
    int centers_length = 0;
    if (!too_small) {
        centers.resize(branching);
        (this->*chooseCenters)(branching, dsindices, indices_length, &centers[0], centers_length);
    }

    if (too_small || centers_length < branching) {
        // leaf node
        node->indices = dsindices;
        std::sort(node->indices, node->indices + indices_length);
        node->childs = NULL;
        return;
    }

    // assign points to clusters
    std::vector<int> labels(indices_length);
    DistanceType cost;
    computeLabels(dsindices, indices_length, &centers[0], centers_length, &labels[0], cost);

    node->childs = pool.allocate<NodePtr>(branching);

    int begin = 0;
    int cursor = 0;
    for (int c = 0; c < branching; ++c) {
        // Move every point labelled c to the front of the unprocessed range.
        for (int j = 0; j < indices_length; ++j) {
            if (labels[j] != c) {
                continue;
            }
            std::swap(dsindices[j], dsindices[cursor]);
            std::swap(labels[j], labels[cursor]);
            ++cursor;
        }

        node->childs[c] = pool.allocate<Node>();
        NodePtr child = node->childs[c];
        child->pivot = centers[c];
        child->indices = NULL;
        computeClustering(child, dsindices + begin, cursor - begin, branching, level + 1);
        begin = cursor;
    }
}


/**
 * Descends from `node` to a leaf, scoring the leaf's points and queueing the
 * branches not taken.
 *
 * At a leaf, every point not yet marked in `checked` is scored, offered to
 * `result`, marked and counted in `checks`. The leaf is skipped when the
 * check budget is spent and the result set is full, unless
 * explore_all_trees is set. At an inner node, the child whose pivot is
 * closest to the query is followed and all other children are pushed onto
 * `heap` keyed by their pivot distance.
 *
 * The scratch distance table is a scoped std::vector: it is released before
 * recursing and also if inserting into the heap or computing a distance
 * raises.
 *
 * @param node               subtree to search
 * @param result             result set receiving candidate neighbours
 * @param vec                query vector
 * @param checks             in/out: number of points examined so far
 * @param maxChecks          check budget
 * @param heap               queue of unexplored branches
 * @param checked            per-row flags of points already examined
 * @param explore_all_trees  when true, leaves are examined even after the budget is spent
 */
void findNN(NodePtr node, ResultSet<DistanceType>& result, const ElementType* vec, int& checks, int maxChecks,
            Heap<BranchSt>* heap, std::vector<bool>& checked, bool explore_all_trees = false)
{
    if (node->childs==NULL) {
        if (!explore_all_trees && (checks>=maxChecks) && result.full()) {
            return;
        }

        for (int i=0; i<node->size; ++i) {
            int index = node->indices[i];
            if (!checked[index]) {
                DistanceType dist = distance(dataset[index], vec, veclen_);
                result.addPoint(dist, index);
                checked[index] = true;
                ++checks;
            }
        }
        return;
    }

    int best_index = 0;
    {
        std::vector<DistanceType> domain_distances(branching_);

        domain_distances[best_index] = distance(vec, dataset[node->childs[best_index]->pivot], veclen_);
        for (int i=1; i<branching_; ++i) {
            domain_distances[i] = distance(vec, dataset[node->childs[i]->pivot], veclen_);
            if (domain_distances[i]<domain_distances[best_index]) {
                best_index = i;
            }
        }

        // Remember the siblings of the best child for later exploration.
        for (int i=0; i<branching_; ++i) {
            if (i!=best_index) {
                heap->insert(BranchSt(node->childs[i],domain_distances[i]));
            }
        }
    }

    findNN(node->childs[best_index],result,vec, checks, maxChecks, heap, checked, explore_all_trees);
}


private:


// The dataset the index is built over.
const
Matrix<ElementType> dataset;


// Parameters the index was created with; refreshed by loadIndex.
IndexParams params;


// Number of dataset rows (dataset.rows).
size_t
size_;


// Length of each feature vector (dataset.cols).
size_t
veclen_;


// Root node of each tree (trees_ entries).
NodePtr* root;


// Per-tree array of dataset row ids (trees_ entries); leaf nodes point into it.
int** indices;


// Distance functor.
Distance distance;


// Allocator for tree nodes and child tables.
PooledAllocator pool;


// Extra byte count added in usedMemory(); saved and restored with the index.
int
memoryCounter;


// Number of children per inner node.
int
branching_;


// Number of trees.
int
trees_;


// Selected center-initialisation algorithm.
flann_centers_init_t centers_init_;


// Nodes holding fewer points than this become leaves.
int
leaf_size_;


};


}


#endif

/* OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_ */


cv::sort

CV_EXPORTS_W void sort(InputArray src, OutputArray dst, int flags)

Sorts each row or each column of a matrix.


cv::min

CV_EXPORTS_W void min(InputArray src1, InputArray src2, OutputArray dst)

Calculates per-element minimum of two arrays or an array and a scalar.


CV_Assert

#define CV_Assert(expr)

Checks a condition at runtime and throws an exception if it fails.


Definition:
base.hpp:342


CV_DbgAssert

#define CV_DbgAssert(expr)


Definition:
base.hpp:375


cv::swap

CV_EXPORTS void swap(Mat &a, Mat &b)

Swaps two matrices