Full-text search: Specify namespaces for indexing

pull/620/head
gnosygnu 7 years ago
parent 10d13a3cd9
commit a4380b6d48

@ -105,7 +105,7 @@ public class Db_conn {
/** Formats {@code sql} with {@code args} via String_.Format, wraps it as a DML query, and executes it; returns the result of Exec_qry. */
public int Exec_sql_args(String sql, Object... args) {
	String formatted_sql = String_.Format(sql, args);
	return this.Exec_qry(Db_qry_sql.dml_(formatted_sql));
}
/** Delegates to Exec_sql_plog with the txn flag set to Bool_.N, so no transaction is begun around the statement. */
public int Exec_sql_plog_ntx(String msg, String sql) {
	boolean txn = Bool_.N;
	return Exec_sql_plog(txn, msg, sql);
}
/** Delegates to Exec_sql_plog with the txn flag set to Bool_.Y, so a transaction is begun (labelled with msg) before the statement runs. */
public int Exec_sql_plog_txn(String msg, String sql) {
	boolean txn = Bool_.Y;
	return Exec_sql_plog(txn, msg, sql);
}
public int Exec_sql_plog(boolean txn, String msg, String sql) {
public int Exec_sql_plog(boolean txn, String msg, String sql) {
Gfo_usr_dlg_.Instance.Plog_many("", "", msg);
if (txn) this.Txn_bgn(msg);
int rv = Exec_sql(sql);

@ -0,0 +1,55 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.fulltexts.indexers.bldrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.indexers.*;
import gplx.xowa.wikis.nss.*;
/**
 * Arguments for a full-text indexing run: which wikis to index and which namespaces to include.
 *
 * Instances are populated either through {@link #Invk} (keys "wikis_" and "ns_ids") or through
 * {@link #New_by_json}. Call {@link #Init_by_wiki} before use to fill in defaults and to expand
 * the "*" namespace wildcard.
 *
 * NOTE(review): ns_ids is later spliced into an SQL "IN (...)" clause by Xofulltext_indexer_mgr.Exec
 * after replacing "|" with ","; the value is not validated here, so callers should only pass
 * pipe-delimited integers.
 */
public class Xofulltext_indexer_args implements Gfo_invk {
	/** Wikis to index; when null, Init_by_wiki sets it to the wiki's domain bytes. */
	public byte[] wikis;
	/** Pipe-delimited namespace ids; null defaults to "0"; "*" expands to every namespace with a non-negative id. */
	public String ns_ids;

	/**
	 * Fills in defaults from the given wiki.
	 * - wikis: null becomes wiki.Domain_bry()
	 * - ns_ids: null becomes "0"; "*" becomes the pipe-delimited list of all non-negative namespace ids
	 *
	 * @param wiki wiki supplying the default domain and the namespace list
	 */
	public void Init_by_wiki(Xowe_wiki wiki) {
		// wikis: null
		if (wikis == null)
			wikis = wiki.Domain_bry();

		// ns: null / *
		if (ns_ids == null)
			ns_ids = "0";
		else if (String_.Eq(ns_ids, "*")) {
			Xow_ns[] ns_ary = wiki.Ns_mgr().Ords_ary();
			int len = ns_ary.length;
			Bry_bfr bfr = Bry_bfr_.New();
			// track whether an id has been written yet; using the loop index for this would
			// emit a leading pipe whenever the first entries are skipped (negative ids)
			boolean first = true;
			for (int i = 0; i < len; i++) {
				Xow_ns ns = ns_ary[i];
				int ns_id = ns.Id();
				if (ns_id < 0) continue; // ignore media, special
				if (first)
					first = false;
				else
					bfr.Add_byte(Byte_ascii.Pipe);
				bfr.Add_int_variable(ns_id);
			}
			ns_ids = bfr.To_str_and_clear();
		}
	}

	/**
	 * Sets a field by key: "wikis_" assigns wikis, "ns_ids" assigns ns_ids (both read from message arg "v", null if absent).
	 *
	 * @return this when the key is handled; Gfo_invk_.Rv_unhandled otherwise
	 */
	public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) {
		if		(ctx.Match(k, "wikis_"))	this.wikis = m.ReadBryOr("v", null);
		else if	(ctx.Match(k, "ns_ids"))	this.ns_ids = m.ReadStrOr("v", null);
		else	return Gfo_invk_.Rv_unhandled;
		return this;
	}

	/**
	 * Builds an instance from a JSON node, reading "wikis" as bytes and "ns_ids" as a string.
	 *
	 * @param args JSON node holding the "wikis" and "ns_ids" members
	 * @return new args object; defaults are not applied until Init_by_wiki is called
	 */
	public static Xofulltext_indexer_args New_by_json(gplx.langs.jsons.Json_nde args) {
		Xofulltext_indexer_args rv = new Xofulltext_indexer_args();
		rv.wikis = args.Get_as_bry("wikis");
		rv.ns_ids = args.Get_as_str("ns_ids");
		return rv;
	}
}

@ -16,10 +16,15 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
package gplx.xowa.addons.wikis.fulltexts.indexers.bldrs; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.indexers.*;
import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.wkrs.*;
public class Xofulltext_indexer_cmd extends Xob_cmd__base {
private final Xofulltext_indexer_args args = new Xofulltext_indexer_args();
/** Creates the command; both arguments are handed straight to the Xob_cmd__base constructor. */
public Xofulltext_indexer_cmd(Xob_bldr bldr, Xowe_wiki wiki) {
	super(bldr, wiki);
}
@Override public void Cmd_run() {
wiki.Init_assert();
new Xofulltext_indexer_mgr().Exec(wiki, null);
new Xofulltext_indexer_mgr().Exec(wiki, null, args);
}
/** Exposes the indexer args object under the key "args"; every other key is reported as unhandled. */
@Override public Object Invk(GfsCtx ctx, int ikey, String k, GfoMsg m) {
	if (!ctx.Match(k, "args"))
		return Gfo_invk_.Rv_unhandled;
	return args;
}
/** Returns the key identifying this command: "search.index". */
@Override public String Cmd_key() {
	return "search.index";
}

@ -20,19 +20,28 @@ import gplx.xowa.wikis.data.*;
import gplx.xowa.htmls.core.dbs.*;
import gplx.xowa.addons.wikis.fulltexts.indexers.svcs.*;
public class Xofulltext_indexer_mgr {
public void Exec(Xowe_wiki wiki, Xofulltext_indexer_ui ui) {
public void Exec(Xowe_wiki wiki, Xofulltext_indexer_ui ui, Xofulltext_indexer_args args) {
// init indexer
Xofulltext_indexer_wkr indexer = new Xofulltext_indexer_wkr();
indexer.Init(wiki);
// get page tbl
Xow_db_file core_db = wiki.Data__core_mgr().Db__core();
gplx.xowa.wikis.data.tbls.Xowd_page_tbl page_tbl = core_db.Tbl__page();
// init args
args.Init_by_wiki(wiki);
int count = 0;
Xoh_page hpg = new Xoh_page();
Xofulltext_indexer_wkr indexer = new Xofulltext_indexer_wkr();
indexer.Init(wiki);
// get rdr and loop
Db_conn conn = page_tbl.Conn();
Db_rdr rdr = conn.Exec_rdr("SELECT page_id, page_score, page_namespace, page_title, page_html_db_id FROM page WHERE page_namespace = 0;");
int count = 0;
Db_rdr rdr = conn.Exec_rdr(Db_sql_.Make_by_fmt(String_.Ary
( "SELECT page_id, page_score, page_namespace, page_title, page_html_db_id"
, "FROM page"
, "WHERE page_namespace IN ({0});"), String_.Replace(args.ns_ids, "|", ",")));
while (rdr.Move_next()) {
// read vars
int page_namespace = rdr.Read_int("page_namespace");
byte[] page_ttl_bry = rdr.Read_bry_by_str("page_title");
int page_id = rdr.Read_int("page_id");
@ -52,7 +61,10 @@ public class Xofulltext_indexer_mgr {
continue;
byte[] html_text = wiki.Html__hdump_mgr().Load_mgr().Parse(hpg, hpg.Db().Html().Zip_tid(), hpg.Db().Html().Hzip_tid(), hpg.Db().Html().Html_bry());
// run index
indexer.Index(page_id, page_score, page_ttl.Page_txt(), html_text);
// notify
if ((++count % 10000) == 0) {
Gfo_usr_dlg_.Instance.Prog_many("", "", "indexing page: ~{0}", count);
if (ui != null)
@ -63,6 +75,7 @@ public class Xofulltext_indexer_mgr {
}
}
// term indexer
indexer.Term();
}
}

@ -16,14 +16,16 @@ Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
package gplx.xowa.addons.wikis.fulltexts.indexers.specials; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.fulltexts.*; import gplx.xowa.addons.wikis.fulltexts.indexers.*;
import gplx.langs.mustaches.*;
public class Xofulltext_indexer_doc implements Mustache_doc_itm {
private final byte[] wikis_bry;
public Xofulltext_indexer_doc
( byte[] wikis_bry) {
private final String wikis_bry, ns_ids;
public Xofulltext_indexer_doc(String wikis_bry, String ns_ids) {
this.wikis_bry = wikis_bry;
this.ns_ids = ns_ids;
}
public boolean Mustache__write(String key, Mustache_bfr bfr) {
if (String_.Eq(key, "wikis"))
bfr.Add_bry(wikis_bry);
bfr.Add_str_u8(wikis_bry);
else if (String_.Eq(key, "ns_ids"))
bfr.Add_str_u8(ns_ids);
else
return false;
return true;

@ -17,15 +17,15 @@ package gplx.xowa.addons.wikis.fulltexts.indexers.specials; import gplx.*; impor
import gplx.xowa.specials.*; import gplx.langs.mustaches.*; import gplx.xowa.wikis.pages.*; import gplx.xowa.wikis.pages.tags.*;
import gplx.dbs.*;
class Xofulltext_indexer_html extends Xow_special_wtr__base {
private final byte[] wikis_bry;
public Xofulltext_indexer_html
( byte[] wikis_bry) {
private final String wikis_bry, ns_ids;
public Xofulltext_indexer_html(String wikis_bry, String ns_ids) {
this.wikis_bry = wikis_bry;
this.ns_ids = ns_ids;
}
/** Returns the addon directory for the given app, as computed by Addon_dir. */
@Override protected Io_url Get_addon_dir(Xoa_app app) {
	return Addon_dir(app);
}
/** Returns the mustache template file: "bin/xofulltext_indexer.template.html" nested under the addon directory. */
@Override protected Io_url Get_mustache_fil(Io_url addon_dir) {
	Io_url template_fil = addon_dir.GenSubFil_nest("bin", "xofulltext_indexer.template.html");
	return template_fil;
}
@Override protected Mustache_doc_itm Bld_mustache_root(Xoa_app app) {
return new Xofulltext_indexer_doc(wikis_bry);
return new Xofulltext_indexer_doc(wikis_bry, ns_ids);
}
@Override protected void Bld_tags(Xoa_app app, Io_url addon_dir, Xopage_html_data page_data) {
Xopg_tag_mgr head_tags = page_data.Head_tags();

@ -20,12 +20,12 @@ public class Xofulltext_indexer_special implements Xow_special_page {
public void Special__gen(Xow_wiki wiki, Xoa_page page, Xoa_url url, Xoa_ttl ttl) {
// get qry if any
Gfo_qarg_mgr url_args = new Gfo_qarg_mgr().Init(url.Qargs_ary());
byte[] wikis_bry = url_args.Read_bry_or("wikis", Bry_.Empty);
// get options and create page
// Xocfg_mgr cfg_mgr = wiki.App().Cfg();
new Xofulltext_indexer_html
( wikis_bry
( url_args.Read_str_or("wikis", wiki.Domain_str())
, url_args.Read_str_or("ns_ids", "0")
).Bld_page_by_mustache(wiki.App(), page, this);
}
/** Package-private constructor; stores the special-page metadata on the instance. */
Xofulltext_indexer_special(Xow_special_meta special__meta) {
	this.special__meta = special__meta;
}

@ -29,8 +29,7 @@ class Xofulltext_indexer_svc implements Gfo_invk {
}
public void Index(Json_nde args) {
// create args
byte[] wikis_bry = args.Get_as_bry("wikis");
Xofulltext_indexer_args indexer_args = new Xofulltext_indexer_args(wikis_bry);
Xofulltext_indexer_args indexer_args = Xofulltext_indexer_args.New_by_json(args);
// launch thread
gplx.core.threads.Thread_adp_.Start_by_val("index", Cancelable_.Never, this, Invk__index, indexer_args);
@ -47,6 +46,7 @@ class Xofulltext_indexer_svc implements Gfo_invk {
continue;
}
// check if dir exists
wiki.Init_by_wiki();
Io_url search_dir = Xosearch_fulltext_addon.Get_index_dir(wiki);
if (Io_mgr.Instance.ExistsDir(search_dir)) {
@ -55,11 +55,14 @@ class Xofulltext_indexer_svc implements Gfo_invk {
continue;
}
// notify bgn
app.Gui__cbk_mgr().Send_json(cbk_trg, "xo.fulltext_indexer.status__note__recv", gplx.core.gfobjs.Gfobj_nde.New()
.Add_str("note", Datetime_now.Get().XtoStr_fmt_yyyy_MM_dd_HH_mm_ss() + ": wiki index started: " + String_.new_u8(domain)));
new Xofulltext_indexer_mgr().Exec((Xowe_wiki)wiki, new Xofulltext_indexer_ui(app.Gui__cbk_mgr(), cbk_trg));
// run index
new Xofulltext_indexer_mgr().Exec((Xowe_wiki)wiki, new Xofulltext_indexer_ui(app.Gui__cbk_mgr(), cbk_trg), args);
// notify end
app.Gui__cbk_mgr().Send_json(cbk_trg, "xo.fulltext_indexer.status__note__recv", gplx.core.gfobjs.Gfobj_nde.New()
.Add_str("note", Datetime_now.Get().XtoStr_fmt_yyyy_MM_dd_HH_mm_ss() + ": wiki index ended: " + String_.new_u8(domain)));
}
@ -72,9 +75,3 @@ class Xofulltext_indexer_svc implements Gfo_invk {
}
private static final String Invk__index = "index";
}
class Xofulltext_indexer_args {
public byte[] wikis;
public Xofulltext_indexer_args(byte[] wikis) {
this.wikis = wikis;
}
}

@ -63,13 +63,13 @@ public class Gflucene_searcher_mgr {
IndexReader reader = DirectoryReader.open(index);
IndexSearcher searcher = new IndexSearcher(reader);
Query query = new QueryParser("body", analyzer).parse(data.query);
// Query multi_query = MultiFieldQueryParser.parse(data.query, new String[] {"body"}, new BooleanClause.Occur []{BooleanClause.Occur.SHOULD}, analyzer);
// Query query = new QueryParser("body", analyzer).parse(data.query);
Query multi_query = MultiFieldQueryParser.parse(data.query, new String[] {"body"}, new BooleanClause.Occur []{BooleanClause.Occur.SHOULD}, analyzer);
// Query body_query = new QueryParser("body", analyzer).parse(data.query);
// Query title_query = new QueryParser("title", analyzer).parse(data.query);
// FunctionQuery boost_query = new FunctionQuery(new LongFieldSource("page_score"));
// CustomScoreQuery query = new CustomScoreQuery(multi_query, boost_query);
FunctionQuery boost_query = new FunctionQuery(new LongFieldSource("page_score"));
CustomScoreQuery query = new CustomScoreQuery(multi_query, boost_query);
// TopDocs docs = searcher.search(query, reader.maxDoc());
TopDocs docs = searcher.search(query, data.match_max);

Loading…
Cancel
Save