自己动手写分布式搜索引擎
上QQ阅读APP看书,第一时间看更新

3.3.8 FieldScoreQuery

FieldScoreQuery叫作函数查询(通过数字型的字段影响排序结果),时间加权排序时会用到。

除了文本列上使用TF-IDF相似性的标准词查询之外,打分还参考数值列的相似性。相似性依赖于查询对象中的值和文档中的数值列中的值之间的距离。例如,可以用高斯函数 $f(x)=\exp\left(-\frac{(x-m)^2}{2s^2}\right)$ 计算相似性,其中 x 是文档数值列中的值,参数 m 是用户输入的值(曲线峰值所在位置),参数 s 控制曲线的宽度(例如 s = 0.5)。

例如,“猎头”使用搜索引擎找人。表示人的文档,有两列:

● description (文本列);

● age (数值列)。

想要找这样的文档:

        description:(x y z) age:30

但是age不是过滤条件,而是score的一部分(对于30岁的人,乘积因子是1.0;对于25岁的人,乘积因子是0.8;等等)。

把ValueSourceQuery和TermQuery包装在CustomScoreQuery中实现。

        /**
         * Demonstrates combining a text query with a numeric-field score through
         * {@code CustomScoreQuery}: the text relevance of each hit is multiplied by a
         * Gaussian factor derived from the person's age, so age influences ranking
         * without acting as a filter.
         */
        public class AgeAndContentScoreQueryTest extends TestCase {

          /** Year used to turn a stored year of birth into an age. */
          private static final int REFERENCE_YEAR = 2011;

          /**
           * Custom score query whose final score is
           * {@code contentScore * exp(-(age - peakX)^2 / (2 * sigma^2))}.
           */
          public class AgeAndContentScoreQuery extends CustomScoreQuery {
            /** Age at which the age factor reaches its maximum of 1.0. */
            protected float peakX;
            /** Standard deviation of the Gaussian; larger values give a wider curve. */
            protected float sigma;

            /**
             * @param subQuery    query on the text field
             * @param valSrcQuery value-source query supplying the year-of-birth field value
             * @param peakX       age that receives the best age score
             * @param sigma       width of the Gaussian; must be positive
             * @throws IllegalArgumentException if {@code sigma} is not positive
             */
            public AgeAndContentScoreQuery(Query subQuery, ValueSourceQuery valSrcQuery,
                float peakX, float sigma) {
              super(subQuery, valSrcQuery);
              // A zero, negative or NaN sigma would yield NaN or meaningless scores.
              if (!(sigma > 0.0f)) {
                throw new IllegalArgumentException("sigma must be positive: " + sigma);
              }
              this.setStrict(true); // do not normalize the score coming from the ValueSourceQuery
              this.peakX = peakX;
              this.sigma = sigma;
            }

            /**
             * Multiplies the text score by the Gaussian age factor.
             *
             * @param doc           document id
             * @param subQueryScore TF-IDF score of the content query
             * @param valSrcScore   value of the year-of-birth field, as a float
             * @return {@code subQueryScore} scaled by the age factor
             */
            @Override
            public float customScore(int doc, float subQueryScore, float valSrcScore) {
              // subQueryScore comes from the TF-IDF score of the content query.
              float contentScore = subQueryScore;

              // valSrcScore is the year of birth; convert it to an age first.
              float age = REFERENCE_YEAR - valSrcScore;

              // Gaussian centred on peakX. The denominator must be parenthesised:
              // "/ 2*sigma*sigma" would divide by 2 and then MULTIPLY by sigma^2.
              double deviation = age - peakX;
              float ageScore =
                  (float) Math.exp(-(deviation * deviation) / (2.0 * sigma * sigma));

              float finalScore = ageScore * contentScore;

              // Note: the "#ageValue" line prints the raw field value (year of birth).
              System.out.println("#contentScore: " + contentScore);
              System.out.println("#ageValue:    " + (int)valSrcScore);
              System.out.println("#ageScore:    " + ageScore);
              System.out.println("#finalScore:   " + finalScore);
              System.out.println("+++++++++++++++++");

              return finalScore;
            }
          }

          protected Directory directory;
          protected Analyzer analyzer = new WhitespaceAnalyzer();
          protected String fieldNameContent = "content";
          protected String fieldNameDOB = "dob";

          /** Builds an in-memory index with three documents (content + year of birth). */
          protected void setUp() throws Exception {
            directory = new RAMDirectory();

            // Documents to index.
            String[] contents = {"foo baz1", "foo baz2 baz3", "baz4"};
            int[] dobs = {1991, 1981, 1987}; // years of birth
            IndexWriter writer = new IndexWriter(directory, analyzer,
                IndexWriter.MaxFieldLength.UNLIMITED);
            try {
              for (int i = 0; i < contents.length; i++) {
                Document doc = new Document();
                // stored and indexed
                doc.add(new Field(fieldNameContent, contents[i], Field.Store.YES,
                    Field.Index.ANALYZED));
                // stored and indexed
                doc.add(new NumericField(fieldNameDOB, Field.Store.YES,
                    true).setIntValue(dobs[i]));
                writer.addDocument(doc);
              }
            } finally {
              // Release the writer even if adding a document fails.
              writer.close();
            }
          }

          /** Releases the in-memory index created by {@link #setUp()}. */
          protected void tearDown() throws Exception {
            if (directory != null) {
              directory.close();
            }
          }

          /** Runs the combined query and prints every hit with its final score. */
          public void testSearch() throws Exception {
            String inputTextQuery = "foo bar";
            float peak = 27.0f;
            // With the corrected Gaussian, sigma is a real standard deviation in years.
            // 10 reproduces the scores the sample produced before the fix (d^2 / 200).
            float sigma = 10.0f;

            QueryParser parser = new QueryParser(fieldNameContent, analyzer);
            Query contentQuery = parser.parse(inputTextQuery);

            ValueSourceQuery dobQuery = new ValueSourceQuery(new IntFieldSource(fieldNameDOB));
            // Alternative:
            // FieldScoreQuery dobQuery = new FieldScoreQuery(fieldNameDOB, Type.INT);

            CustomScoreQuery finalQuery = new AgeAndContentScoreQuery(contentQuery,
                dobQuery, peak, sigma);

            IndexSearcher searcher = new IndexSearcher(directory);
            try {
              TopDocs docs = searcher.search(finalQuery, 10);

              System.out.println("\n发现的文档:\n");
              for (ScoreDoc match : docs.scoreDocs) {
                Document d = searcher.doc(match.doc);
                System.out.println("CONTENT: " + d.get(fieldNameContent) );
                System.out.println("D.O.B.:  " + d.get(fieldNameDOB) );
                System.out.println("SCORE:   " + match.score );
                System.out.println("-----------------");
              }
            } finally {
              // Close the searcher so the underlying reader is released.
              searcher.close();
            }
          }
        }

日期加权:

        // Date-weighting rule: derives `score` from `x` relative to `now`.
        // NOTE(review): the declarations of x, now, sigma and score are not shown in
        // this fragment; presumably x and now are timestamps on the same scale — confirm.
        if (x >= now) {
            // x is at or past `now`: score is the plain ratio now / x.
            // NOTE(review): if now and x are integer types this division truncates —
            // confirm they are floating-point.
            score = now / x;
        } else {
            // x is before `now`: score is now / (now + sigma) minus sigma / x.
            // The explicit cast narrows the result to float before assignment.
            score = (float) (now / (now+sigma) - sigma / x);
        }