From 6ccee1052684b86954a204628065049b3091adfb Mon Sep 17 00:00:00 2001 From: gnosygnu Date: Thu, 16 Mar 2017 17:02:19 -0400 Subject: [PATCH] Full-text search: Adjust indexing weights and scores --- gplx.gflucene/.classpath | 1 + .../indexers/Gflucene_indexer_mgr.java | 33 ++++++++++++------- .../searchers/Gflucene_searcher_mgr.java | 20 +++++++++-- 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/gplx.gflucene/.classpath b/gplx.gflucene/.classpath index 43bad2c2d..ca4582672 100644 --- a/gplx.gflucene/.classpath +++ b/gplx.gflucene/.classpath @@ -8,5 +8,6 @@ + diff --git a/gplx.gflucene/src/gplx/gflucene/indexers/Gflucene_indexer_mgr.java b/gplx.gflucene/src/gplx/gflucene/indexers/Gflucene_indexer_mgr.java index 8e6898e4d..8a2ef9ffb 100644 --- a/gplx.gflucene/src/gplx/gflucene/indexers/Gflucene_indexer_mgr.java +++ b/gplx.gflucene/src/gplx/gflucene/indexers/Gflucene_indexer_mgr.java @@ -35,7 +35,7 @@ public class Gflucene_indexer_mgr { private IndexWriterConfig config; private Directory index; private IndexWriter wtr; - private FieldType body_fld; + private FieldType body_fld_type; public Gflucene_indexer_mgr() { } @@ -61,28 +61,37 @@ public class Gflucene_indexer_mgr { // create writer try { wtr = new IndexWriter(index, config); -// ((TieredMergePolicy)config.getMergePolicy()). } catch (IOException e) { throw Err_.new_exc(e, "lucene_index", "failed to create writer"); } // create field for body - this.body_fld = new FieldType(); - body_fld.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); + this.body_fld_type = new FieldType(); + body_fld_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); + body_fld_type.setTokenized(true); + body_fld_type.setStored(false); // body_fld.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); -// body_fld.setStored(true); - body_fld.setTokenized(true); // body_fld.setStoreTermVectors(true); // body_fld.setStoreTermVectorOffsets(true); } public void Exec(Gflucene_doc_data doc_data) { - // org.apache.lucene.document. - Document doc = new Document(); -// doc.add(new SortedNumericDocValuesField("page_score", data.score)); - doc.add(new StoredField("page_score", doc_data.score)); + Document doc = new Document(); + doc.add(new StoredField("page_id", doc_data.page_id)); - doc.add(new TextField("title", doc_data.title, Field.Store.YES)); - doc.add(new Field("body", doc_data.body, body_fld)); + doc.add(new NumericDocValuesField("page_score", doc_data.score)); + +// float score = ((float)doc_data.score / 1000000); +// float score = doc_data.score; + + TextField title_field = new TextField("title", doc_data.title, Field.Store.YES); +// title_field.setBoost(score * 1024); +// title_field.setBoost(score); + doc.add(title_field); + + Field body_field = new Field("body", doc_data.body, body_fld_type); +// body_field.setBoost(score); + doc.add(body_field); + try { wtr.addDocument(doc); } catch (IOException e) { diff --git a/gplx.gflucene/src/gplx/gflucene/searchers/Gflucene_searcher_mgr.java b/gplx.gflucene/src/gplx/gflucene/searchers/Gflucene_searcher_mgr.java index f778af859..974c561cb 100644 --- a/gplx.gflucene/src/gplx/gflucene/searchers/Gflucene_searcher_mgr.java +++ b/gplx.gflucene/src/gplx/gflucene/searchers/Gflucene_searcher_mgr.java @@ -24,7 +24,13 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.queries.CustomScoreQuery; +import org.apache.lucene.queries.function.FunctionQuery; +import org.apache.lucene.queries.function.valuesource.LongFieldSource; +import org.apache.lucene.queryparser.classic.MultiFieldQueryParser; import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; @@ -56,9 +62,15 @@ public class Gflucene_searcher_mgr { try { IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); - - + Query query = new QueryParser("body", analyzer).parse(data.query); +// Query multi_query = MultiFieldQueryParser.parse(data.query, new String[] {"body"}, new BooleanClause.Occur []{BooleanClause.Occur.SHOULD}, analyzer); + +// Query body_query = new QueryParser("body", analyzer).parse(data.query); +// Query title_query = new QueryParser("title", analyzer).parse(data.query); +// FunctionQuery boost_query = new FunctionQuery(new LongFieldSource("page_score")); +// CustomScoreQuery query = new CustomScoreQuery(multi_query, boost_query); + // TopDocs docs = searcher.search(query, reader.maxDoc()); TopDocs docs = searcher.search(query, data.match_max); ScoreDoc[] hits = docs.scoreDocs; @@ -66,8 +78,10 @@ public class Gflucene_searcher_mgr { for(int i = 0; i < hits.length; i++) { int docId = hits[i].doc; Document d = searcher.doc(docId); - Gflucene_doc_data doc = new Gflucene_doc_data(Integer.parseInt(d.get("page_id")), Integer.parseInt(d.get("page_score")), d.get("title"), ""); +// Gflucene_doc_data doc = new Gflucene_doc_data(Integer.parseInt(d.get("page_id")), Integer.parseInt(d.get("page_score")), d.get("title"), ""); + Gflucene_doc_data doc = new Gflucene_doc_data(Integer.parseInt(d.get("page_id")), 0, d.get("title"), ""); doc.lucene_score = hits[i].score; +// Tfds.Write(doc.lucene_score, doc.title); list.Add(doc); }