/*
Given a domain and a category, list the category's member pages and their readability scores.
EX: http://xowa.org/wikimedia.html?domain=en.wikipedia.org&category=Earth
*/
(function (globalObj) {
    // shared "wm" namespace; reuse it if another script already created it
    var wm = globalObj.wm = globalObj.wm || {};

    // sentinel score for text that cannot be scored; sorts after every real score and prints as "N/A"
    var SCORE_NA = 999;

    // Escape text for safe insertion into HTML markup or a double-quoted attribute.
    // Needed because category titles (from the URL), page titles and excerpts (from the API)
    // end up in innerHTML.
    function escapeHtml(text) {
        return String(text)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&#39;');
    }

    // Parse JSON text; returns null instead of throwing on malformed input.
    function parseJson(text) {
        try {
            return JSON.parse(text);
        }
        catch (e) {
            return null;
        }
    }

    // Numeric sort key for a page; pages without a numeric score sort with the "N/A" sentinel.
    function sortableScore(page) {
        return (typeof page.score === 'number' && isFinite(page.score)) ? page.score : SCORE_NA;
    }

    wm.category = new function () {
        // **********************************************
        // member variables
        // **********************************************
        // test mode; when false, canned responses are used instead of network calls
        this.production = true;

        // wikimedia domain; EX: en.wikipedia.org
        this.domain = 'en.wikipedia.org';

        // pages in category; sparse array keyed by page id
        this.pages = [];

        // number of pages in category (capped to excerptsMax by getExcerpts)
        this.pagesTotal = 0;

        // number of excerpt responses processed so far
        this.excerptsFound = 0;

        // maximum number of excerpts to find
        this.excerptsMax = 50;

        // title of category being evaluated (without the "Category:" prefix)
        this.category_title = '';

        // **********************************************
        // main entry function
        // **********************************************
        // Reads "domain" and "category" from the page url and starts the evaluation.
        // Does nothing visible when the url has no category argument.
        this.run = function () {
            setTimeout(function () {
                // parse url to get domain and page
                var url = globalObj.location.href;
                var domain = wm.category.getQueryArg(url, 'domain');
                wm.category.category_title = wm.category.getQueryArg(url, 'category');

                // use domain arg if available; otherwise use default
                // NOTE: domain comes from the url and becomes the host of api calls and links, so only accept plain host names
                if (domain && /^[a-z0-9.-]+(:\d+)?$/i.test(domain))
                    wm.category.domain = domain;

                // handle bare url
                if (!wm.category.category_title)
                    return;

                // write status
                wm.category.writeHtml('<div class="header_div">Evaluating Category:' + escapeHtml(wm.category.category_title) + '. Please wait...</div>');

                // find pages in category
                wm.category.findPagesInCategory(wm.category.domain, wm.category.category_title);
            }, 100);
        };

        // Returns the decoded value of query argument "name" in "url" (defaults to the current page url).
        // Returns null when the argument is absent and '' when it is present without a value.
        this.getQueryArg = function (url, name) {
            // REF: http://stackoverflow.com/questions/901115/how-can-i-get-query-string-values-in-javascript
            if (!url) {
                url = globalObj.location.href;
            }
            name = name.replace(/[\[\]]/g, "\\$&");
            var regex = new RegExp("[?&]" + name + "(=([^&#]*)|&|#|$)");
            var match = regex.exec(url);
            if (!match) return null;
            if (!match[2]) return '';
            var raw = match[2].replace(/\+/g, " ");
            try {
                return decodeURIComponent(raw);
            }
            catch (e) {
                // malformed percent-escape (EX: "100%"); fall back to the raw text instead of throwing
                return raw;
            }
        };

        // **********************************************
        // find page in category
        // **********************************************
        // Requests the members of "Category:<category>" on "domain" (up to excerptsMax of them).
        this.findPagesInCategory = function (domain, category) {
            // run ajax; NOTE: must specify origin to bypass CORS; http://stackoverflow.com/a/38921370
            if (wm.category.production) {
                // title is url-encoded so characters like "&" or "#" do not corrupt the query string
                var url = 'https://' + domain + '/w/api.php?action=query&format=json&formatversion=2&origin=*&list=categorymembers&cmlimit=' + wm.category.excerptsMax + '&cmtitle=' + encodeURIComponent('Category:' + category);
                wm.category.runAjax(url, wm.category.findPagesInCategoryCallback);
            }
            else {
                // canned response for test mode
                var root = { "query": { "categorymembers": [{ "pageid": 9228, "ns": 0, "title": "Earth" }] } };
                wm.category.findPagesInCategoryCallbackRoot(root);
            }
        };

        // XMLHttpRequest onreadystatechange handler for the category-members request ("this" is the xhr).
        this.findPagesInCategoryCallback = function () {
            if (this.readyState != 4) return;
            var root = (this.status == 200) ? parseJson(this.responseText) : null;
            if (!root) {
                // report the failure instead of leaving the "Please wait..." status up forever
                wm.category.writeHtml('Unable to load Category:' + escapeHtml(wm.category.category_title));
                return;
            }
            wm.category.findPagesInCategoryCallbackRoot(root);
        };

        // Registers every member listed in root.query.categorymembers, then fetches their excerpts.
        // Writes a "No results" message when no page is known.
        this.findPagesInCategoryCallbackRoot = function (root) {
            // tolerate responses without a member list (EX: api error payloads)
            var members = (root && root.query && root.query.categorymembers) || [];
            for (var i = 0; i < members.length; i++) {
                var member = members[i];

                // placeholder score until the excerpt has been evaluated
                member.score = 'N/A';

                // increment total; only count a page id once
                if (wm.category.pages[member.pageid] === undefined)
                    wm.category.pagesTotal++;

                // populate local pages table
                wm.category.pages[member.pageid] = member;
            }

            if (wm.category.pagesTotal == 0) {
                wm.category.writeHtml("No results found for Category:" + escapeHtml(wm.category.category_title));
            }
            else {
                // get excerpts
                wm.category.getExcerpts();
            }
        };

        // **********************************************
        // get excerpts
        // **********************************************
        // Requests the intro extract of each known page, at most excerptsMax of them.
        this.getExcerpts = function () {
            var ids = Object.keys(wm.category.pages);
            var requestCount = ids.length;
            if (requestCount > wm.category.excerptsMax) {
                requestCount = wm.category.excerptsMax;
                // NOTE: must update pagesTotal so that the "last excerpt" check can still succeed
                wm.category.pagesTotal = wm.category.excerptsMax;
            }

            // loop each page to get excerpt
            for (var i = 0; i < requestCount; i++) {
                var page = wm.category.pages[ids[i]];
                // run ajax; NOTE: must specify origin to bypass CORS; http://stackoverflow.com/a/38921370
                if (wm.category.production) {
                    var url = 'https://' + wm.category.domain + '/w/api.php?action=query&format=json&formatversion=2&origin=*&prop=extracts&exintro=1&explaintext&titles=' + encodeURIComponent(page.title);
                    wm.category.runAjax(url, wm.category.getExcerptCallback);
                }
                else {
                    // canned response for test mode
                    var root = { "query": { "pages":
                        [
                            { "pageid": 9228, "ns": 0, "title": "Earth", "extract": "Earth (Greek: Γαῖα Gaia; Latin: Terra)." }
                        ] } };
                    wm.category.getExcerptCallbackRoot(root);
                }
            }
        };

        // XMLHttpRequest onreadystatechange handler for one excerpt request ("this" is the xhr).
        // Failed or unparsable responses are forwarded as null so they still count as processed.
        this.getExcerptCallback = function () {
            if (this.readyState != 4) return;
            var root = (this.status == 200) ? parseJson(this.responseText) : null;
            wm.category.getExcerptCallbackRoot(root);
        };

        // Stores excerpt and readability numbers for the page in "root", and prints all results
        // once as many responses as pagesTotal have been processed.
        this.getExcerptCallbackRoot = function (root) {
            // get variables; only 1 page per api call
            var page = root && root.query && root.query.pages && root.query.pages[0];
            var entry = page ? wm.category.pages[page.pageid] : undefined;

            // skip responses that are empty or that refer to a page not in the local table
            if (entry) {
                var excerpt = page.extract || '';

                // calc readability score
                var score = wm.category.calcReadabilityScore(page.title, excerpt);

                // update local page
                entry.excerpt = excerpt;
                entry.score = score[0];
                entry.totalSentences = score[1];
                entry.totalWords = score[2];
                entry.totalSyllables = score[3];
            }

            // if last excerpt, print all
            if (++wm.category.excerptsFound == wm.category.pagesTotal) {
                wm.category.printResults();
            }
        };

        // **********************************************
        // calc readability
        // **********************************************
        // Computes the Flesch reading-ease score of text "s" ("title" is accepted but not used).
        // Returns [score, totalSentences, totalWords, totalSyllables]; score is 999 ("N/A")
        // when the text has at most one word or no countable sentence.
        this.calcReadabilityScore = function (title, s) {
            // REF: https://en.wikipedia.org/wiki/Flesch–Kincaid_readability_tests
            // count words and sentences
            var words = wm.category.toWordArray(s);
            if (words.length <= 1) return [SCORE_NA, 0, 0, 0];
            var totalWords = words.length;
            var totalSentences = wm.category.countSentences(s);

            // count syllables
            var totalSyllables = 0;
            for (var i = 0; i < totalWords; i++) {
                totalSyllables += wm.category.countSyllablesInWord(words[i]);
            }

            // no countable sentence would divide by zero below
            if (totalSentences == 0) return [SCORE_NA, 0, totalWords, totalSyllables];

            // calc score: again, see https://en.wikipedia.org/wiki/Flesch–Kincaid_readability_tests
            var score = 206.835 - (1.015 * (totalWords / totalSentences)) - (84.6 * (totalSyllables / totalWords));
            return [score, totalSentences, totalWords, totalSyllables];
        };

        // Splits "s" into words on any run of whitespace (spaces, tabs, newlines).
        // Returns [] for null, undefined, empty or whitespace-only text.
        this.toWordArray = function (s) {
            var trimmed = (s == null ? '' : String(s)).trim();
            return trimmed.length == 0 ? [] : trimmed.split(/\s+/);
        };

        // Counts sentences in "s": text is cut wherever a word character is followed by ".", "?" or "!"
        // and then whitespace or end of text; the word character is consumed by the cut.
        this.countSentences = function (s) {
            // REF: http://stackoverflow.com/questions/35215348/count-sentences-in-string-with-javascript
            // split directly on the terminator pattern so a literal "|" in the text cannot act as a separator
            var pieces = String(s == null ? '' : s).split(/\w[.?!](?:\s|$)/);
            var count = 0;
            for (var i = 0; i < pieces.length; i++) {
                // ignore 0 length pieces; note that "Yes." leaves an empty trailing piece
                // ignore short pieces; 5 is a heuristic for maximum length of acronym
                // NOTE: not handling "Words U.S.A." which breaks up into ["Words U", "S", "A"]
                if (pieces[i].trim().length >= 5) {
                    count++;
                }
            }
            return count;
        };

        // Estimates the number of syllables in "word" by counting vowel groups; never returns less than 1.
        this.countSyllablesInWord = function (word) {
            // REF: http://stackoverflow.com/questions/5686483/how-to-compute-number-of-syllables-in-a-word-in-javascript
            var lower = word.toLowerCase();
            if (lower.length <= 3) { return 1; }
            var stem = lower
                .replace(/(?:[^laeiouy]es|ed|[^laeiouy]e)$/, '')   // drop endings matched by this pattern
                .replace(/^y/, '');                                 // drop a leading "y"
            var vowelGroups = stem.match(/[aeiouy]{1,2}/g);
            return vowelGroups == null ? 1 : vowelGroups.length;
        };

        // **********************************************
        // printResults
        // **********************************************
        // Renders all known pages, sorted by score, into the document body.
        this.printResults = function () {
            // sort a dense copy; the pages table itself stays keyed by page id
            var ids = Object.keys(wm.category.pages);
            var rows = [];
            for (var i = 0; i < ids.length; i++) {
                rows.push(wm.category.pages[ids[i]]);
            }
            rows.sort(wm.category.compareResult);

            // generate string
            var s
                = '<div class="header_div">\n'
                + ' <div>Readability scores for member pages of ' + wm.category.buildWikiLink(wm.category.domain, 'Category:' + wm.category.category_title) + ' in ' + escapeHtml(wm.category.domain) + '</div>\n'
                + '</div>\n'
                + '<br/>\n'
                + '<div class="results_div">\n'
                + ' <div class="result_div">\n'
                + ' <div class="result_title result_header">Title</div>\n'
                + ' <div class="result_score result_header">Score</div>\n'
                + ' </div>'
                ;
            for (var j = 0; j < rows.length; j++) {
                var page = rows[j];

                // get score; pages without a usable numeric score print as N/A
                var score = page.score;
                if (typeof score !== 'number' || !isFinite(score) || score === SCORE_NA) {
                    score = 'N/A';
                }
                else {
                    score = score.toFixed(2);
                }

                s += ' <div class="result_div">\n'
                    + ' <div class="result_title tooltip">' + wm.category.buildWikiLink(wm.category.domain, page.title) + '\n'
                    + ' <span class="tooltiptext">\n'
                    + ' Sentences: ' + (page.totalSentences || 0) + '<br/>\n'
                    + ' Words: ' + (page.totalWords || 0) + '<br/>\n'
                    + ' Syllables: ' + (page.totalSyllables || 0) + '<br/>\n'
                    + ' <br/>\n'
                    + escapeHtml(page.excerpt || '')
                    + ' </span>\n'
                    + ' </div>\n'
                    + ' <div class="result_score">' + score + '</div>\n'
                    + ' </div>\n';
            }
            s += '</div>';

            // print string
            wm.category.writeHtml(s);
        };

        // Returns an html anchor to page "page" on wiki host "wiki"; all parts are escaped/encoded.
        this.buildWikiLink = function (wiki, page) {
            // encode every reserved character ("?", "#", "&", ...) but keep ":" and "/" readable in the path
            var page_enc = encodeURIComponent(page.replace(/ /g, '_'))
                .replace(/%3A/g, ':')
                .replace(/%2F/g, '/');
            return '<a href="https://' + escapeHtml(wiki) + '/wiki/' + page_enc + '">' + escapeHtml(page) + '</a>';
        };

        // Comparator for sorting pages by ascending score; pages without a numeric score sort last.
        this.compareResult = function (lhs, rhs) {
            // sort from least readable to most readable
            return sortableScore(lhs) - sortableScore(rhs);
        };

        // **********************************************
        // utility
        // **********************************************
        // Sends an asynchronous GET to "url"; "callback" is installed as onreadystatechange.
        this.runAjax = function (url, callback) {
            var xhr = new XMLHttpRequest();
            xhr.open("GET", url, true);
            xhr.onreadystatechange = callback;
            xhr.send();
        };

        // Replaces the whole document body with "html"; callers must pass already-escaped markup.
        this.writeHtml = function (html) {
            document.body.innerHTML = html;
        };
    };
}(typeof window !== 'undefined' ? window : (typeof globalThis !== 'undefined' ? globalThis : this)));