#1,原理
每张图片都可以生成颜色分布的直方图(color histogram)。如果两张图片的直方图很接近,就可以认为它们很相似。

任何一种颜色都是由红绿蓝三原色(RGB)构成的,所以上图共有4张直方图(三原色直方图 + 最后合成的直方图)。
如果每种原色都可以取256个值,那么整个颜色空间共有1600万种颜色(256的三次方)。针对这1600万种颜色比较直方图,计算量实在太大了,因此需要采用简化方法。可以将0~255分成四个区:0~63为第0区,64~127为第1区,128~191为第2区,192~255为第3区。这意味着红绿蓝分别有4个区,总共可以构成64种组合(4的3次方)。
任何一种颜色必然属于这64种组合中的一种,这样就可以统计每一种组合包含的像素数量。

上图是某张图片的颜色分布表,将表中最后一栏提取出来,组成一个64维向量(7414, 230, 0, 0, 8, ..., 109, 0, 0, 3415, 53929)。这个向量就是这张图片的特征值或者叫"指纹"。
于是,寻找相似图片就变成了找出与其最相似的向量。向量之间的相似程度可以用皮尔逊相关系数或者余弦相似度来计算(下文的代码采用皮尔逊相关系数)。
#2,C++代码实现
#include<math.h>
#include <time.h>
#include<bitset>
#include <cstdio>
#include <cstdlib>
#include<fstream>
#include<iostream>
#include <limits>
#include<string>
#include<vector>
#include<opencv2/core/core.hpp>
#include<opencv2/highgui/highgui.hpp>
using namespace std;
using namespace cv;
// Forward declarations of the helpers defined after main().

// Adds the per-pixel colour-zone counts of pic to the 64 counters in PixVec.
void getRGB(Mat &pic, vector<int > &PixVec);
// Packs three 2-bit zone values into one index in the range 0..63.
int bit2int(bitset<2>& bbit, bitset<2>& gbit, bitset<2>& rbit);
// Maps a channel value to its zone (0..3); zone boundaries are 64, 128 and 192.
bitset<2> classify(int val);
// Pearson correlation coefficient of two count vectors.
double correlation(vector<int> &Pix1, vector<int> &Pix2);
// Reads one file name per line from OrigFileName and appends "<FileName>\<name>" to OrigNamePic.
void getNameFromTxt(vector<string> &OrigNamePic, string FileName, string OrigFileName);
int main(){
	  double beginTime = clock();
	  string FileName="rawdata";
	  string oeder1 = "DIR .\\" + FileName + "\\*.jpg / B >FileNameList.TXT ";
	  system(oeder1.c_str());
	  vector<string> PicName;
	  getNameFromTxt(PicName, FileName, "FileNameList.TXT");
	  int Piclen = PicName.size();
	  for (int m = 0; m < Piclen; m++){
		    cout << "Compare the " << m << "-th picture with the others!" << endl;
		    for (int n = m+1; n < Piclen; n++){
			      Mat pic1 = imread(PicName[m], 1);
			      Mat pic2 = imread(PicName[n], 1);
			      //PixVec
			      vector<int> Pix1Vec(64, 0);
			      getRGB(pic1, Pix1Vec);
			      vector<int> Pix2Vec(64, 0);
			      getRGB(pic2, Pix2Vec);
			      double correlVal = correlation(Pix1Vec, Pix2Vec);
			      //cout << "The value of correlation coefficient is: " << correlVal << endl;
			      if (correlVal > 0.999999){
				        string movePic = "move .\\" + PicName[m]+" DeletePic >nul";
				        system(movePic.c_str());
				        break;
			      }
		    }	
	  }
	  double endTime = clock();
	  cout << "beginTime:" << beginTime << endl
		    << "endTime:" << endTime << endl
		    << "endTime-beginTime:" << endTime - beginTime << "ms" << endl;
		
	  system("Pause");
	  return 0;
}
/**
 * @brief Loads picture paths from a listing file and then deletes that file.
 *
 * Every non-empty line of OrigFileName is treated as a file name; the string
 * "<fileName>\<line>" is appended to OrigNamePic. If the listing file cannot
 * be opened, nothing is appended.
 *
 * @param OrigNamePic  Receives the paths (existing entries are kept).
 * @param fileName     Folder name prepended to every listed file name.
 * @param OrigFileName Path of the listing file; it is removed afterwards.
 */
void getNameFromTxt(std::vector<std::string> &OrigNamePic, std::string fileName, std::string OrigFileName){
    {
        std::ifstream listing(OrigFileName);
        std::string line;
        // Loop on the read itself: testing eof() up front never terminates
        // when the file failed to open, because eofbit is never set then.
        while (std::getline(listing, line)){
            if (!line.empty()){
                OrigNamePic.push_back(fileName + "\\" + line);
            }
        }
    }   // stream is closed here, before the file is removed

    // Remove the temporary listing without going through the shell.
    std::remove(OrigFileName.c_str());
}
void getRGB(Mat &pic, vector<int > &PixVec){
	  int rowNum=pic.rows;
	  int colNum=pic.cols;
	  int pixNum=colNum*pic.channels();
	
	  if(pic.channels()!=3)
		    cout<<"The channel of the picture is not 3!"<<endl;
	
	  Mat_<Vec3b>::iterator it=pic.begin<Vec3b>();
	  Mat_<Vec3b>::iterator itend=pic.end<Vec3b>();
		
	  for(;it!=itend;++it){
		    bitset<2>  bpix,gpix,rpix;
		    bpix=classify((*it)[0]);
		    gpix=classify((*it)[1]);
		    rpix=classify((*it)[2]);
		
		    int clasVal=bit2int(bpix, gpix, rpix);
		    PixVec[clasVal]++;
	  }
	
}
/**
 * @brief Packs three 2-bit zone values into a single 6-bit bin index.
 *
 * Bit layout of the result (most significant first): bbit, gbit, rbit,
 * i.e. index = bbit * 16 + gbit * 4 + rbit.
 *
 * @param bbit Zone stored in bits 4..5.
 * @param gbit Zone stored in bits 2..3.
 * @param rbit Zone stored in bits 0..1.
 * @return Index in the range 0..63.
 */
int bit2int(std::bitset<2>& bbit, std::bitset<2>& gbit, std::bitset<2>& rbit){
    const unsigned long highPart = bbit.to_ulong() << 4;
    const unsigned long midPart  = gbit.to_ulong() << 2;
    const unsigned long lowPart  = rbit.to_ulong();
    return static_cast<int>(highPart | midPart | lowPart);
}
/**
 * @brief Maps a channel value to one of four zones.
 *
 * Zone 0 covers values below 64, zone 1 covers 64..127, zone 2 covers
 * 128..191 and zone 3 covers everything from 192 upwards.
 *
 * @param val Channel value.
 * @return The zone number (0..3) as a 2-bit set.
 */
std::bitset<2> classify(int val){
    // Exclusive upper bounds of zones 0, 1 and 2; anything beyond is zone 3.
    static const int upperBounds[3] = {64, 128, 192};

    int zone = 0;
    while (zone < 3 && val >= upperBounds[zone]){
        ++zone;
    }
    return std::bitset<2>(static_cast<unsigned long long>(zone));
}
/**
 * @brief Pearson correlation coefficient of two count vectors.
 *
 * Computes sum((x - mean_x) * (y - mean_y)) divided by
 * sqrt(sum((x - mean_x)^2)) * sqrt(sum((y - mean_y)^2)).
 *
 * @param Pix1 First vector.
 * @param Pix2 Second vector; must have the same length as Pix1.
 * @return The coefficient, or NaN when it is undefined: the vectors are
 *         empty, their lengths differ, or one of them has zero variance
 *         (the division is then 0/0). NaN compares false against any
 *         threshold, so such pairs are never reported as similar.
 */
double correlation(std::vector<int> &Pix1, std::vector<int> &Pix2){
    const std::size_t len = Pix1.size();
    // Indexing Pix2 with Pix1's length would read out of bounds when Pix2 is
    // shorter, so mismatched (and empty) inputs are rejected up front.
    if (len == 0 || Pix2.size() != len){
        return std::numeric_limits<double>::quiet_NaN();
    }

    double Xmean = 0.0, Ymean = 0.0;
    for (std::size_t i = 0; i < len; i++){
        Xmean += Pix1[i];
        Ymean += Pix2[i];
    }
    Xmean /= static_cast<double>(len);
    Ymean /= static_cast<double>(len);

    double XYsum = 0.0, Xsum = 0.0, Ysum = 0.0;
    for (std::size_t j = 0; j < len; j++){
        const double dx = static_cast<double>(Pix1[j]) - Xmean;
        const double dy = static_cast<double>(Pix2[j]) - Ymean;
        XYsum += dx * dy;
        Xsum += dx * dx;
        Ysum += dy * dy;
    }

    return XYsum / (sqrt(Xsum) * sqrt(Ysum));
}
#3,程序运行结果

原文:http://www.cnblogs.com/sophia-hxw/p/5674686.html