上QQ阅读APP看书,第一时间看更新
3.3.8 FieldScoreQuery
FieldScoreQuery叫作函数查询(通过数字型的字段影响排序结果),时间加权排序时会用到。
除了文本列上使用TF-IDF相似性的标准词查询之外,打分还参考数值列的相似性。相似性依赖于查询对象中的值和文档中数值列的值之间的距离,例如使用高斯函数 $\exp\left(-\frac{(x-m)^2}{2s^2}\right)$ 计算,其中 x 是文档中数值列的值,m 是用户输入的值(m = [user input]),s = 0.5。
例如,“猎头”使用搜索引擎找人。表示人的文档,有两列:
● description (文本列);
● age (数值列)。
想要找这样的文档:
description:(x y z) age:30
但是age不是过滤条件,而是score的一部分(对于30岁的人,乘积因子是1.0;对于25岁的人,乘积因子是0.8;等等)。
把ValueSourceQuery和TermQuery包装在CustomScoreQuery中实现。
public class AgeAndContentScoreQueryTest extends TestCase{ public class AgeAndContentScoreQuery extends CustomScoreQuery { protected float peakX; protected float sigma; //接收4个参数,其中subQuery表示文本列查询,valSrcQuery表示值查询 public AgeAndContentScoreQuery(Query subQuery, ValueSourceQuery valSrcQuery, float peakX, float sigma) { super(subQuery, valSrcQuery); this.setStrict(true); // 不要归一化从ValueSourceQuery得到的分值! this.peakX = peakX; // 哪个年纪的相关性最好 this.sigma = sigma; } @Override public float customScore(int doc, float subQueryScore, float valSrcScore){ // subQueryScore来源于内容查询的td-idf分值 float contentScore = subQueryScore; // valSrcScore是生日字段的值,表示成一个浮点数 // 把年纪值转换成高斯型年纪相关分值 float x = (2011- valSrcScore); // age float ageScore = (float) Math.exp(-Math.pow(x - peakX, 2) / 2*sigma*sigma); float finalScore = ageScore * contentScore; System.out.println("#contentScore: " + contentScore); System.out.println("#ageValue: " + (int)valSrcScore); System.out.println("#ageScore: " + ageScore); System.out.println("#finalScore: " + finalScore); System.out.println("+++++++++++++++++"); return finalScore; } } protected Directory directory; protected Analyzer analyzer = new WhitespaceAnalyzer(); protected String fieldNameContent = "content"; protected String fieldNameDOB = "dob"; protected void setUp() throws Exception { directory = new RAMDirectory(); analyzer = new WhitespaceAnalyzer(); //索引文档 String[] contents = {"foo baz1", "foo baz2 baz3", "baz4"}; int[] dobs = {1991, 1981, 1987}; // 生日 IndexWriter writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED); for (int i = 0; i < contents.length; i++) { Document doc = new Document(); doc.add(new Field(fieldNameContent, contents[i], Field.Store.YES, Field.Index.ANALYZED)); // store并且index doc.add(new NumericField(fieldNameDOB, Field.Store.YES, true).setIntValue(dobs[i])); // store & index writer.addDocument(doc); } writer.close(); } public void testSearch() throws Exception { String inputTextQuery = "foo bar"; float peak = 
27.0f; float sigma = 0.1f; QueryParser parser = new QueryParser(fieldNameContent, analyzer); Query contentQuery = parser.parse(inputTextQuery); ValueSourceQuery dobQuery = new ValueSourceQuery( new IntFieldSource(fieldNameDOB) ); // 或者FieldScoreQuery dobQuery = new // FieldScoreQuery(fieldNameDOB, Type.INT); CustomScoreQuery finalQuery = new AgeAndContentScoreQuery(contentQuery, dobQuery, peak, sigma); IndexSearcher searcher = new IndexSearcher(directory); TopDocs docs = searcher.search(finalQuery, 10); System.out.println("\n发现的文档:\n"); for(ScoreDoc match : docs.scoreDocs) { Document d = searcher.doc(match.doc); System.out.println("CONTENT: " + d.get(fieldNameContent) ); System.out.println("D.O.B.: " + d.get(fieldNameDOB) ); System.out.println("SCORE: " + match.score ); System.out.println("-----------------"); } } }
日期加权:
// Date weighting: assigns score from one of two formulas, chosen by whether x >= now.
// NOTE(review): presumably x is the document's timestamp and now the current time,
// expressed in the same unit — confirm against the surrounding code.
// NOTE(review): if now, x and sigma are integral types, the divisions below truncate —
// verify the declared types.
if (x >= now) { score = now / x; } else { score = (float) (now / (now+sigma) - sigma / x); }