ElasticSearch查询的相关度分数是3部分综合的分数,使用的是TF/IDF算法(Term Frequency&Inverse Document Frequency)。注:从Elasticsearch 5.0开始,默认的相似度算法是BM25,它同样基于下面这3个因素,所以后面演示结果中出现的是idf和tfNorm的公式。
1、根据Term Frequency(词条出现频率)
我们查询的文本中的词条在本document中出现了多少次,出现次数越多,相关度越高。
例如搜索内容:hello world
在文档1:hello,I love china.中出现了hello,出现了一次
在文档2:hello world,how are you!中出现了hello world,相当于出现了两次,所以文档2的相关度分数高于文档1。
2、根据Inverse Document Frequency(逆文档频率)
根据查询的文本中的词条在索引的全部文档中出现了多少次,出现的次数越多,相关度越低。
例如搜索内容:hello world
在文档1:hello,what are you doing?中hello出现了一次。
在文档2:I like the world.中world出现了一次。
按照第1项算,这两个文档的分数是一样的,但是还要比较hello在该索引的所有文档中出现多少次,world在该索引的所有文档中出现多少次,假如hello在索引的所有文档中出现了500次,world出现了100次。那么文档2的相关度分数要高于文档1。
3、根据Field-length norm(字段长度规约)
field越长,相关度越低。
例如搜索内容:hello world,有下面两个文档。
文档1:{"title":"hello,what's your name?","content":"qwieurowieuolsdjflk"}
文档2:{"title":"hi,good morning","content":"lkjkljkj....world"}
在文档1的title字段中搜索到hello,在文档2的content字段中搜索到world,content字段的长度比title字段长,所以文档2的相关度低
4、演示查看分数是如何计算的
准备数据:
PUT /lib { "settings":{ "number_of_shards":3, "number_of_replicas":0 }, "mappings":{ "user":{ "properties":{ "name":{"type":"text"}, "address":{"type":"text"}, "age":{"type":"integer"}, "interests":{ "type":"text" }, "birthday":{"type":"date"} } } } }
put /lib/user/1 { "name":"zhaoliu", "address":"hei long jiang sheng tie ling shi", "age":50, "birthday":"1970-12-12", "interests":"xi huang hejiu,duanlian,lvyou" } put /lib/user/2 { "name":"zhaoming", "address":"bei jing hai dian qu qing he zhen", "age":20, "birthday":"1998-10-12", "interests":"xi huan hejiu,duanlian,changge" } put /lib/user/3 { "name":"lisi", "address":"bei jing hai dian qu qing he zhen", "age":23, "birthday":"1998-10-12", "interests":"xi huan hejiu,duanlian,changge" } put /lib/user/4 { "name":"wangwu", "address":"bei jing hai dian qu qing he zhen", "age":26, "birthday":"1998-10-12", "interests":"xi huan biancheng,tingyinyue,lvyou" } put /lib/user/5 { "name":"zhangsan", "address":"bei jing chao yang qu", "age":29, "birthday":"1988-10-12", "interests":"xi huan tingyinyue,changge,tiaowu" }
在查询后面添加explain=true
GET lib/user/_search?explain=true { "query": { "match": { "interests": "duanlian,changge" } } }
查询结果如下。可以看到每个词条的分数由idf与tfNorm(其中包含词频和字段长度两个因素)相乘得到,各个词条的分数加起来就是总的分数:
{ "took": 8, "timed_out": false, "_shards": { "total": 5, "successful": 5, "skipped": 0, "failed": 0 }, "hits": { "total": 4, "max_score": 1.3862944, "hits": [ { "_shard": "[lib][2]", "_node": "AJ3x6yc8TfKj6_zx6VRm0g", "_index": "lib", "_type": "user", "_id": "2", "_score": 1.3862944, "_source": { "name": "zhaoming", "address": "bei jing hai dian qu qing he zhen", "age": 20, "birthday": "1998-10-12", "interests": "xi huan hejiu,duanlian,changge" }, "_explanation": { "value": 1.3862944, "description": "sum of:", "details": [ { "value": 0.6931472, "description": "weight(interests:duanlian in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.6931472, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.6931472, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 1, "description": "docFreq", "details": [] }, { "value": 2, "description": "docCount", "details": [] } ] }, { "value": 1, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 5, "description": "avgFieldLength", "details": [] }, { "value": 5, "description": "fieldLength", "details": [] } ] } ] } ] }, { "value": 0.6931472, "description": "weight(interests:changge in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.6931472, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.6931472, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 1, "description": "docFreq", "details": [] }, { "value": 2, "description": "docCount", "details": [] } ] }, { "value": 1, "description": "tfNorm, computed as 
(freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 5, "description": "avgFieldLength", "details": [] }, { "value": 5, "description": "fieldLength", "details": [] } ] } ] } ] } ] } }, { "_shard": "[lib][4]", "_node": "AJ3x6yc8TfKj6_zx6VRm0g", "_index": "lib", "_type": "user", "_id": "3", "_score": 0.5753642, "_source": { "name": "lisi", "address": "bei jing hai dian qu qing he zhen", "age": 23, "birthday": "1998-10-12", "interests": "xi huan hejiu,duanlian,changge" }, "_explanation": { "value": 0.5753642, "description": "sum of:", "details": [ { "value": 0.2876821, "description": "weight(interests:duanlian in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.2876821, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.2876821, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 1, "description": "docFreq", "details": [] }, { "value": 1, "description": "docCount", "details": [] } ] }, { "value": 1, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 5, "description": "avgFieldLength", "details": [] }, { "value": 5, "description": "fieldLength", "details": [] } ] } ] } ] }, { "value": 0.2876821, "description": "weight(interests:changge in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.2876821, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.2876821, 
"description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 1, "description": "docFreq", "details": [] }, { "value": 1, "description": "docCount", "details": [] } ] }, { "value": 1, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 5, "description": "avgFieldLength", "details": [] }, { "value": 5, "description": "fieldLength", "details": [] } ] } ] } ] } ] } }, { "_shard": "[lib][1]", "_node": "AJ3x6yc8TfKj6_zx6VRm0g", "_index": "lib", "_type": "user", "_id": "5", "_score": 0.2876821, "_source": { "name": "zhangsan", "address": "bei jing chao yang qu", "age": 29, "birthday": "1988-10-12", "interests": "xi huan tingyinyue,changge,tiaowu" }, "_explanation": { "value": 0.2876821, "description": "sum of:", "details": [ { "value": 0.2876821, "description": "weight(interests:changge in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.2876821, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.2876821, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 1, "description": "docFreq", "details": [] }, { "value": 1, "description": "docCount", "details": [] } ] }, { "value": 1, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 5, "description": "avgFieldLength", "details": [] }, { "value": 5, "description": "fieldLength", "details": [] 
} ] } ] } ] } ] } }, { "_shard": "[lib][3]", "_node": "AJ3x6yc8TfKj6_zx6VRm0g", "_index": "lib", "_type": "user", "_id": "1", "_score": 0.2876821, "_source": { "name": "zhaoliu", "address": "hei long jiang sheng tie ling shi", "age": 50, "birthday": "1970-12-12", "interests": "xi huang hejiu,duanlian,lvyou" }, "_explanation": { "value": 0.2876821, "description": "sum of:", "details": [ { "value": 0.2876821, "description": "weight(interests:duanlian in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.2876821, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.2876821, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 1, "description": "docFreq", "details": [] }, { "value": 1, "description": "docCount", "details": [] } ] }, { "value": 1, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 5, "description": "avgFieldLength", "details": [] }, { "value": 5, "description": "fieldLength", "details": [] } ] } ] } ] } ] } } ] } }
5、查看一个文档能否匹配上某个查询
使用上面的数据,id为2的可以匹配
GET /lib/user/2/_explain { "query":{ "match":{ "interests":"duanlian,changge" } } }
查询结果:
{ "_index": "lib", "_type": "user", "_id": "2", "matched": true, "explanation": { "value": 1.3862944, "description": "sum of:", "details": [ { "value": 0.6931472, "description": "weight(interests:duanlian in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.6931472, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.6931472, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 1, "description": "docFreq", "details": [] }, { "value": 2, "description": "docCount", "details": [] } ] }, { "value": 1, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 5, "description": "avgFieldLength", "details": [] }, { "value": 5, "description": "fieldLength", "details": [] } ] } ] } ] }, { "value": 0.6931472, "description": "weight(interests:changge in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.6931472, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.6931472, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 1, "description": "docFreq", "details": [] }, { "value": 2, "description": "docCount", "details": [] } ] }, { "value": 1, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 5, "description": "avgFieldLength", "details": [] }, { "value": 5, "description": 
"fieldLength", "details": [] } ] } ] } ] } ] } }
使用上面的数据,id为10的不能匹配:
GET /lib/user/10/_explain { "query":{ "match":{ "interests":"duanlian,changge" } } }
查询结果:
{ "_index": "lib", "_type": "user", "_id": "10", "matched": false }
原文:https://www.cnblogs.com/javasl/p/12661972.html