1
0
mirror of https://github.com/gnosygnu/xowa.git synced 2026-03-02 03:49:30 +00:00
This commit is contained in:
gnosygnu
2015-07-12 21:10:02 -04:00
commit 794b5a232f
3099 changed files with 238212 additions and 0 deletions

View File

@@ -0,0 +1,157 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.dbs;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.sql.*;
//public class Bug_Utf8 {
// public static void main(String[] s) throws SQLException, java.io.UnsupportedEncodingException {
// // init connection
// Connection conn = DriverManager.getConnection
// ( "jdbc:mysql://localhost/bug_utf8?characterEncoding=UTF8"
//// ( "jdbc:mysql://localhost/bug_utf8?useUnicode=true&characterEncoding=UTF8&character_set_client=UTF8&character_set_database=UTF8&character_set_results=UTF8&character_set_server=UTF8&character_set_system=UTF8"
// , "root"
// , "mysql7760"
//
// );
//
// // retrieve resultSet
// Statement cmd = conn.createStatement();
// cmd.execute("select * from simple_table;");
// ResultSet rdr = cmd.getResultSet();
//
// // get value
// rdr.next();
// String name = rdr.getNString("name");
//
// // output results
// System.out.println("name=" + name + " len=" + name.getBytes().length + " 0=" + name.getBytes()[0]);
// System.out.println("name=" + name + " len=" + name.getBytes("UTF8").length + " 0=" + name.getBytes("UTF8")[0] + " 1=" + name.getBytes("UTF8")[1]);
//
// Charset utf8charset = Charset.forName("UTF-8");
// Charset iso88591charset = Charset.forName("ISO-8859-1");
//
// ByteBuffer inputBuffer = ByteBuffer.wrap(name.getBytes());
//
// // decode ISO-8859-1
// CharBuffer data = iso88591charset.decode(inputBuffer);
//
// // encode UTF-8
// ByteBuffer outputBuffer = utf8charset.encode(data);
// byte[] outputData = outputBuffer.array();
// name = new String(outputData, "UTF-8");
// System.out.println("name=" + name + " len=" + name.getBytes().length + " 0=" + name.getBytes()[0]);
// System.out.println("name=" + name + " len=" + name.getBytes("UTF8").length + " 0=" + name.getBytes("UTF8")[0] + " 1=" + name.getBytes("UTF8")[1]);
// }
//}
/*
Hi all. The topic is pretty straightforward, but I've been staring at it for quite some time.
I'm trying to retrieve non-English characters from a MySQL database in UTF-8. In my example below, I use "à" (U+00E0: Latin Small Letter A With Grave) but I've also tried with random Japanese characters (Hex=E7A798). I'm new to java/jdbc, so I may be missing something basic, but I've searched for quite a while, and not discovered anything.
I've made sure that my database was created in UTF-8, and that my connection is in UTF8. I've tried enabling all jdbc connection string options (see commented line below) and it makes no difference. I've also tried System.setProperty("file.encoding", "UTF-8");.
The odd thing is that somehow the code below works when I run it from a JUnit test. (the actual results match the expected ones)
Any help would be appreciated. Thanks in advance.
[list]SQL to create data[/list]
[code]
CREATE DATABASE bug_utf8 CHARACTER SET utf8 COLLATE utf8_general_ci;
USE bug_utf8;
DROP TABLE IF EXISTS simple_table;
CREATE TABLE simple_table (name varchar(255) NOT NULL);
INSERT INTO simple_table (name) VALUES ('à');
SELECT Hex(name) from simple_table; -- returns C3A0
[/code]
[list]Java code[/list]
[code]
import java.sql.*;
public class Bug_Utf8 {
public static void main(String[] s) throws SQLException, java.io.UnsupportedEncodingException {
// init connection
Connection conn = DriverManager.getConnection
( "jdbc:mysql://localhost/bug_utf8?useUnicode=true&characterEncoding=UTF8"
// ( "jdbc:mysql://localhost/bug_utf8?useUnicode=true&characterEncoding=UTF8&character_set_client=UTF8&character_set_database=UTF8&character_set_results=UTF8&character_set_server=UTF8&character_set_system=UTF8"
, "root"
, "yourpassword"
);
// retrieve resultSet
Statement cmd = conn.createStatement();
cmd.execute("select * from simple_table;");
ResultSet rdr = cmd.getResultSet();
// get value
rdr.next();
String name = rdr.getNString("name");
// output results
// actual
System.out.println("name=" + name + " len=" + name.getBytes().length + " 0=" + name.getBytes()[0]);
// expecting: C3 A0
System.out.println("name=" + name + " len=" + name.getBytes("UTF8").length + " 0=" + name.getBytes("UTF8")[0] + " 1=" + name.getBytes("UTF8")[1]);
}
}
[/code]
[list]Environment Details[/list]
OS: Windows XP SP3
MySQL: Server version: 5.1.40-community MySQL Community Server (GPL)
Java: java version "1.6.0_20"
JDBC: 5.1.12
JUnit: 4_4.5 v20090824
IDE: Eclipse 20090920-1017 (not that it should matter)
*/
import java.sql.*;
/**
 * Stand-alone smoke test for the SQLite JDBC driver.
 *
 * Recreates a "people" table in test.db, inserts three rows through a batched
 * PreparedStatement, then prints every row to standard output.
 */
public class Bug_Utf8 {
    /**
     * Runs the smoke test.
     *
     * @param args ignored
     * @throws Exception if the driver class cannot be loaded or any JDBC call fails
     */
    public static void main(String[] args) throws Exception {
        Class.forName("org.sqlite.JDBC");
        Connection conn = DriverManager.getConnection("jdbc:sqlite:test.db");
        // nested try/finally so every JDBC resource is released even when a call throws
        try {
            Statement stat = conn.createStatement();
            try {
                stat.executeUpdate("drop table if exists people;");
                stat.executeUpdate("create table people (name, occupation);");
                PreparedStatement prep = conn.prepareStatement(
                    "insert into people values (?, ?);");
                try {
                    prep.setString(1, "Gandhi");
                    prep.setString(2, "politics");
                    prep.addBatch();
                    prep.setString(1, "Turing");
                    prep.setString(2, "computers");
                    prep.addBatch();
                    prep.setString(1, "Wittgenstein");
                    prep.setString(2, "smartypants");
                    prep.addBatch();
                    // run the whole batch inside one transaction; re-enabling auto-commit commits it
                    conn.setAutoCommit(false);
                    prep.executeBatch();
                    conn.setAutoCommit(true);
                } finally {
                    prep.close();
                }
                ResultSet rs = stat.executeQuery("select * from people;");
                try {
                    while (rs.next()) {
                        System.out.println("name = " + rs.getString("name"));
                        System.out.println("job = " + rs.getString("occupation"));
                    }
                } finally {
                    rs.close();
                }
            } finally {
                stat.close();
            }
        } finally {
            conn.close();
        }
    }
}

View File

@@ -0,0 +1,245 @@
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012 gnosygnu@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gplx.dbs;
import gplx.*; import gplx.dbs.engines.sqlite.*;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.sql.*;
public class SqliteDbMain {
/**
 * Entry point for ad-hoc SQLite experiments; currently runs the insert benchmark
 * for 20 rows whose ids start at 1000001. The commented-out calls are alternative experiments.
 */
public static void main(String[] args) throws Exception {
    final int row_count = 20;
    final int first_id = 1000000 + 1;
    SqliteDbMain app = new SqliteDbMain();
//  app.JdbcInit(args);
//  app.Read();
//  app.Mass_upload(Io_url_.new_dir_("J:\\gplx\\xowl\\file\\all#meta\\en.wikipedia.org\\"));
//  app.CreateMany(20, 0);
    app.CreateMany(row_count, first_id);
}// 179,167,161,147,160,165,159
/*
'5281' '189'
'5266' '189'
'5640' '177'
'5719' '174'
'5766' '173'
*/
// private void JdbcInit(String[] args) {
// try {
// Class.forName("SQLite.JDBCDriver");
// }
// catch (Exception e) {
// ConsoleAdp._.WriteLine(e.getMessage());
// }
// }
/**
 * Insert benchmark: sets up the fsdb_xtn_thm table in E:\test.sqlite3 via
 * Sqlite_engine_.Tbl_create_and_delete, inserts {@code number} rows one statement
 * at a time, then reports the elapsed ticks and the rows-per-tick rate through Tfds.Write.
 *
 * @param number   how many rows to insert
 * @param base_val value of thm_id / thm_owner_id for the first row; row i uses base_val + i
 */
private void CreateMany(int number, int base_val) {
    long time_bgn = Env_.TickCount();
    Db_conn provider = Db_conn_pool.I.Get_or_new__sqlite(Io_url_.new_fil_("E:\\test.sqlite3"));
    long time_elapsed;
    try {
        String tbl_sql = String_.Concat_lines_nl
        ( "CREATE TABLE fsdb_xtn_thm"
        , "( thm_id integer NOT NULL PRIMARY KEY"
        , ", thm_owner_id integer NOT NULL"
        , ", thm_w integer NOT NULL"
        , ", thm_h integer NOT NULL"
        , ", thm_thumbtime integer NOT NULL"
        , ", thm_bin_db_id integer NOT NULL"
        , ", thm_size bigint NOT NULL"
        , ", thm_modified varchar(14) NOT NULL"
        , ", thm_hash varchar(40) NOT NULL"
        , ");"
        );
        Sqlite_engine_.Tbl_create_and_delete(provider, "fsdb_xtn_thm", tbl_sql);
//      provider.Txn_mgr().Txn_bgn();
        Db_stmt insert_stmt = Db_stmt_.new_insert_(provider, "fsdb_xtn_thm", "thm_id", "thm_owner_id", "thm_w", "thm_h", "thm_thumbtime", "thm_bin_db_id", "thm_size", "thm_modified", "thm_hash");
        for (int i = 0; i < number; i++) {
            insert_stmt.Clear()
                .Val_int(base_val + i)
                .Val_int(base_val + i)
                .Val_int(220)
                .Val_int(200)
                .Val_int(-1)
                .Val_int(15)
                .Val_long(23456)
                .Val_str("")
                .Val_str("")
                .Exec_insert();
        }
        time_elapsed = (Env_.TickCount() - time_bgn);
//      provider.Txn_mgr().Txn_end();
    }
    finally {
        // release the connection even when table creation or an insert throws
        provider.Rls_conn();
    }
    // clamp the divisor: a run finishing within one tick would otherwise divide by zero
    long rows_per_tick = number / Math.max(time_elapsed, 1L);
    Tfds.Write(time_elapsed, rows_per_tick);
    // 250; 260
    Tfds.Write("");
}
// Raw JDBC handles shared by the methods below: conn is (re)assigned by Read, Index and Mass_upload;
// stmt is prepared by Mass_upload and then used by Iterate_dir and Insert_line.
Connection conn; PreparedStatement stmt;
/**
 * Opens meta.db, selects two hard-coded titles from the files table and prints
 * ttl / orig_w / orig_h for each row found.
 *
 * Any exception is handed to Exc_.Noop and otherwise ignored. The Statement and
 * ResultSet are always closed; the connection stays in the conn field and is not closed here.
 */
void Read() {
    Statement stat = null;
    ResultSet rs = null;
    try {
        Class.forName("org.sqlite.JDBC");
        conn = DriverManager.getConnection("jdbc:sqlite:J:\\gplx\\xowl\\file\\all#meta\\en.wikipedia.org\\meta.db");
        stat = conn.createStatement();
//      stat.executeUpdate("DROP TABLE temp;");
//      stat.executeUpdate("CREATE TABLE temp (ttl varchar(1024));");
//      PreparedStatement prep = conn.prepareStatement("INSERT INTO temp VALUES (?);");
//      conn.setAutoCommit(false);
//      prep.setString(1, "Rembrandt auto 1627.jpg"); prep.addBatch();
//      prep.setString(1, "Rembrandt van Rijn 184.jpg"); prep.addBatch();
//      prep.setString(1, "Rembrandt laughing.jpg"); prep.addBatch();
//      prep.setString(1, "Rembrandt van Rijn 199.jpg"); prep.addBatch();
//      prep.setString(1, "Rembrandt Harmensz. van Rijn 144.jpg"); prep.addBatch();
//      prep.setString(1, "Self-portrait at 34 by Rembrandt.jpg"); prep.addBatch();
//      prep.setString(1, "Selfportrait Rembrandt1641.jpg"); prep.addBatch();
//      prep.setString(1, "Rembrandt Harmensz. van Rijn 127b.jpg"); prep.addBatch();
//      prep.setString(1, "Rembrandt Harmensz. van Rijn 132.jpg"); prep.addBatch();
//      prep.setString(1, "Rembrandt - Self Portrait111.jpg"); prep.addBatch();
//      prep.setString(1, "Rembrandt self portrait.jpg"); prep.addBatch();
//      prep.setString(1, "Rembrant Self-Portrait, 1660.jpg"); prep.addBatch();
//      prep.setString(1, "Rembrandt van rijn-self portrait.jpg"); prep.addBatch();
//      prep.setString(1, "Rembrandt, Auto-portrait, 1660.jpg"); prep.addBatch();
//      prep.setString(1, "Rembrandt van Rijn 142 version 02.jpg"); prep.addBatch();
//      prep.setString(1, "Rembrandt Harmensz. van Rijn 135.jpg"); prep.addBatch();
//      prep.executeBatch();
//      conn.setAutoCommit(true);
//      ResultSet rs = stat.executeQuery("SELECT TOP 10 files.* FROM files JOIN temp ON files.ttl = temp.ttl;");
//      ResultSet rs = stat.executeQuery("SELECT files.* FROM files LIMIT 100;");
        rs = stat.executeQuery("SELECT files.* FROM files WHERE ttl IN ('380CHANGI.jpg', '20120523Palmen_Hockenheim1.jpg') ;");
        while (rs.next()) {
            System.out.println("ttl = " + rs.getString("ttl") + "; orig_w = " + rs.getString("orig_w") + "; orig_h = " + rs.getString("orig_h"));
        }
    }catch(Exception e) {
        Exc_.Noop(e);
    }
    finally {
        // close in reverse order of creation; a failure to close is ignored like every other error here
        try {
            if (rs != null) rs.close();
        }catch(Exception e) {
            Exc_.Noop(e);
        }
        try {
            if (stat != null) stat.close();
        }catch(Exception e) {
            Exc_.Noop(e);
        }
    }
}
/**
 * Opens meta.db and creates the files_ndx index on files(ttl) inside an explicit
 * transaction, after issuing PRAGMAs that turn off synchronous writes and
 * change counting and keep the journal and temp store in memory.
 *
 * Any exception is handed to Exc_.Noop and otherwise ignored. The Statement is
 * always closed; the connection stays in the conn field and is not closed here.
 */
void Index() {
    Statement stat = null;
    try {
        Class.forName("org.sqlite.JDBC");
        conn = DriverManager.getConnection("jdbc:sqlite:J:\\gplx\\xowl\\file\\all#meta\\en.wikipedia.org\\meta.db");
        stat = conn.createStatement();
        stat.executeUpdate("PRAGMA synchronous=OFF");
        stat.executeUpdate("PRAGMA count_changes=OFF");
        stat.executeUpdate("PRAGMA journal_mode=MEMORY");
        stat.executeUpdate("PRAGMA temp_store=MEMORY");
        conn.setAutoCommit(false);
        stat.executeUpdate("CREATE INDEX files_ndx ON files (ttl);");
        conn.commit();
        conn.setAutoCommit(true);
    }catch(Exception e) {
        Exc_.Noop(e);
    }
    finally {
        // release the statement whether or not the index was created
        try {
            if (stat != null) stat.close();
        }catch(Exception e) {
            Exc_.Noop(e);
        }
    }
}
/**
 * Rebuilds the files table in meta.db from the files found under {@code dir}.
 *
 * Drops and recreates the table, issues bulk-load PRAGMAs, prepares the shared
 * insert statement (stmt field), walks the directory tree via Iterate_dir, then
 * executes the remaining batch and commits.
 *
 * Any exception is handed to Exc_.Noop and otherwise ignored. The local Statement
 * is always closed; the conn and stmt fields are left as they are.
 *
 * @param dir root directory to scan
 */
void Mass_upload(Io_url dir) {
    Statement stat = null;
    try {
        Class.forName("org.sqlite.JDBC");
        conn = DriverManager.getConnection("jdbc:sqlite:J:\\gplx\\xowl\\file\\all#meta\\en.wikipedia.org\\meta.db");
        stat = conn.createStatement();
        stat.executeUpdate("drop table if exists files;");
        String sql = String_.Concat_lines_nl
        ( "CREATE TABLE files"
        , "( ttl varchar(1024)"
        , ", redirect varchar(1024)"
        , ", ext int"
        , ", orig_mode int"
        , ", orig_w int"
        , ", orig_h int"
        , ", thumbs varchar(2048)" // assuming 10 bytes per thumb, roughly 200 thumbs
        , ");"
        );
        stat.executeUpdate(sql);
        ConsoleAdp._.WriteLine(DateAdp_.Now().XtoStr_fmt_yyyyMMdd_HHmmss_fff());
//      stat.executeUpdate("BEGIN TRANSACTION");
        stat.executeUpdate("PRAGMA synchronous=OFF");
        stat.executeUpdate("PRAGMA count_changes=OFF");
        stat.executeUpdate("PRAGMA journal_mode=MEMORY");
        stat.executeUpdate("PRAGMA temp_store=MEMORY");
        conn.setAutoCommit(false);
        stmt = conn.prepareStatement("insert into files values (?, ?, ?, ?, ?, ?, ?);");
        Iterate_dir(dir);
//      stat.executeUpdate("COMMIT TRANSACTION");
        stmt.executeBatch();
        conn.commit();
        conn.setAutoCommit(true);
    }catch(Exception e) {
        Exc_.Noop(e);
    }
    finally {
        // release the DDL/PRAGMA statement whether or not the upload succeeded
        try {
            if (stat != null) stat.close();
        }catch(Exception e) {
            Exc_.Noop(e);
        }
    }
}
/**
 * Walks {@code dir} depth-first: sub-directories are recursed into, every other
 * entry is passed to Insert_file. If the directory held at least one non-directory
 * entry, the batch queued on the shared stmt is executed and cleared afterwards.
 * Exceptions from Insert_file and from the batch calls are handed to Exc_.Noop.
 *
 * @param dir directory to scan; its raw path is written to the console first
 */
void Iterate_dir(Io_url dir) {
    Io_url[] entries = Io_mgr.I.QueryDir_args(dir).DirInclude_().ExecAsUrlAry();
    int entry_count = entries.length;
    ConsoleAdp._.WriteLine(dir.Raw());
    boolean saw_file = false;
    int idx = 0;
    while (idx < entry_count) {
        Io_url entry = entries[idx++];
        if (entry.Type_dir()) {
            Iterate_dir(entry);
            continue;
        }
        saw_file = true;
        try {
            Insert_file(entry);
        }catch(Exception e) {
            Exc_.Noop(e);
        }
    }
    if (!saw_file) return;  // nothing was handed to Insert_file for this directory
    try {
        stmt.executeBatch();
        stmt.clearBatch();
    }catch(Exception e) {
        Exc_.Noop(e);
    }
}
/**
 * Loads one file and feeds each of its lines to Insert_line.
 * Returns immediately when String_.EqNot reports that the extension differs from ".csv".
 *
 * @param url file to load
 */
void Insert_file(Io_url url) {
    if (String_.EqNot(url.Ext(), ".csv")) return;
    String[] rows = String_.SplitLines_nl(Io_mgr.I.LoadFilStr(url));
    for (String row : rows) {
        Insert_line(row);
    }
}
/**
 * Splits one '|'-delimited line and queues it as a row on the shared insert
 * statement (stmt). Parameter positions follow the files table created in
 * Mass_upload: 1=ttl, 2=redirect, 3=ext, 4=orig_mode, 5=orig_w, 6=orig_h, 7=thumbs.
 *
 * Field usage: flds[2] is the title; with exactly 4 fields flds[3] is the redirect;
 * with more than 4 fields flds[3] carries the ext code and flds[4] the orig data,
 * which is parsed as a mode digit followed by width and height around a comma.
 *
 * Lines with fewer than 3 fields are skipped. Any exception is handed to Exc_.Noop
 * and the line is not queued.
 *
 * @param line raw line from a .csv file
 */
void Insert_line(String line) {
    try {
        String[] flds = String_.Split(line, '|');
        int flds_len = flds.length;
        // flds[2] is read unconditionally below, so shorter lines cannot be stored
        if (flds_len < 3) return;
        // JDBC keeps parameter values between executions; reset the optional columns
        // so a short row does not inherit values bound for the previous row
        stmt.setNull(2, Types.VARCHAR);   // redirect
        stmt.setNull(3, Types.INTEGER);   // ext
        stmt.setNull(4, Types.INTEGER);   // orig_mode
        stmt.setNull(5, Types.INTEGER);   // orig_w
        stmt.setNull(6, Types.INTEGER);   // orig_h
        stmt.setNull(7, Types.VARCHAR);   // thumbs
        stmt.setString(1, flds[2]);
        if (flds_len == 4)
            stmt.setString(2, flds[3]);
        if (flds_len > 4) {
            // ext is stored as a single character offset by 32
            stmt.setInt(3, Bry_.new_a7(flds[3])[0] - 32);
            byte[] orig = Bry_.new_a7(flds[4]);
            int orig_mode = orig[0] - Byte_ascii.Num_0;
            int comma_pos = Bry_finder.Find_fwd(orig, Byte_ascii.Comma);
            int orig_w = Bry_.Xto_int_or(orig, 2, comma_pos, -1);
            int orig_h = Bry_.Xto_int_or(orig, comma_pos + 1, orig.length, -1);
            stmt.setInt(4, orig_mode);
            stmt.setInt(5, orig_w);
            stmt.setInt(6, orig_h);
            // NOTE(review): the guard checks for a 6th field but flds[4] (the orig data) is stored
            // as thumbs; flds[5] may have been intended -- confirm against the csv format before changing
            if (flds_len > 5)
                stmt.setString(7, flds[4]);
        }
        stmt.addBatch();
    }catch(Exception e) {
        Exc_.Noop(e);
    }
}
}