/*
Given a domain and a category, list its member pages and readability score
EX: http://xowa.org/wikimedia.html?domain=en.wikipedia.org&category=Earth
*/
(function (wm) {
    // sentinel returned by calcReadabilityScore when no score can be computed
    var SCORE_NOT_AVAILABLE = 999;

    // parse JSON without throwing; callers treat null as a failed response
    function parseJson(text) {
        try {
            return JSON.parse(text);
        } catch (e) {
            return null;
        }
    }

    // true only for real, finite scores (not 'N/A', NaN, Infinity or the sentinel)
    function isUsableScore(score) {
        return typeof score === 'number' && isFinite(score) && score !== SCORE_NOT_AVAILABLE;
    }

    wm.category = new function() {
        // **********************************************
        // member variables
        // **********************************************
        // false = use canned responses instead of calling the wikimedia api
        this.production = true;

        // wikimedia domain; EX: en.wikipedia.org
        this.domain = 'en.wikipedia.org';

        // dense array of member pages (sorted by score once results are printed)
        this.pages = [];

        // lookup of the same page objects by pageid
        this.pagesById = {};

        // number of pages whose excerpt is being waited for
        this.pagesTotal = 0;

        // number of excerpt requests completed (successfully or not)
        this.excerptsFound = 0;

        // maximum number of excerpts to find
        this.excerptsMax = 50;

        // member variable for category
        this.category_title = '';

        // **********************************************
        // main entry function
        // **********************************************
        // Reads `domain` and `category` from the page url and starts the evaluation.
        this.run = function() {
            setTimeout(function() {
                // parse url to get domain and page
                var url = window.location.href;
                var domain = wm.category.getQueryArg(url, 'domain');
                wm.category.category_title = wm.category.getQueryArg(url, 'category');

                // use domain arg if available and well-formed; otherwise use default.
                // The domain is used to build request urls and links, so anything that
                // is not a plain host name is ignored.
                if (domain && wm.category.isValidDomain(domain)) wm.category.domain = domain;

                // handle bare url
                if (!wm.category.category_title) return;

                // write status
                wm.category.writeHtml('<p>Evaluating Category:'
                    + wm.category.escapeHtml(wm.category.category_title)
                    + '. Please wait...</p>');

                // find pages in category
                wm.category.findPagesInCategory(wm.category.domain, wm.category.category_title);
            }, 100);
        };

        // Returns the decoded value of query arg `name` in `url`:
        // null if the arg is absent, '' if it is present without a value.
        this.getQueryArg = function(url, name) {
            // REF: http://stackoverflow.com/questions/901115/how-can-i-get-query-string-values-in-javascript
            if (!url) {
                url = window.location.href;
            }
            name = name.replace(/[\[\]]/g, "\\$&");
            var regex = new RegExp("[?&]" + name + "(=([^&#]*)|&|#|$)");
            var match = regex.exec(url);
            if (!match) return null;
            if (!match[2]) return '';
            var raw = match[2].replace(/\+/g, " ");
            try {
                return decodeURIComponent(raw);
            } catch (e) {
                // malformed percent-escape (URIError): fall back to the raw text
                return raw;
            }
        };

        // Returns true if `domain` looks like a host name (letters, digits, dots,
        // hyphens, optional :port).
        this.isValidDomain = function(domain) {
            return /^[a-z0-9](?:[a-z0-9.-]*[a-z0-9])?(?::\d+)?$/i.test(String(domain));
        };

        // **********************************************
        // find page in category
        // **********************************************
        // Requests the members of `category` on `domain` (or feeds canned data in test mode).
        this.findPagesInCategory = function(domain, category) {
            // run ajax; NOTE: must specify origin to bypass CORS; http://stackoverflow.com/a/38921370
            if (wm.category.production) {
                var url = 'https://' + domain
                    + '/w/api.php?action=query&format=json&formatversion=2&origin=*&list=categorymembers'
                    + '&cmlimit=' + encodeURIComponent(wm.category.excerptsMax)
                    // encode the title so that '&', '#', '+' etc. do not corrupt the query string
                    + '&cmtitle=' + encodeURIComponent('Category:' + category);
                wm.category.runAjax(url, wm.category.findPagesInCategoryCallback);
            } else {
                var root = {"query":{"categorymembers":[{"pageid":9228,"ns":0,"title":"Earth"}]}};
                wm.category.findPagesInCategoryCallbackRoot(root);
            }
        };

        // XMLHttpRequest onreadystatechange handler for the category request (`this` is the xhr).
        this.findPagesInCategoryCallback = function() {
            if (this.readyState !== 4) return;
            var root = this.status === 200 ? parseJson(this.responseText) : null;
            if (!root) {
                // report the failure instead of leaving "Please wait..." on screen forever
                wm.category.writeHtml('Unable to load Category:'
                    + wm.category.escapeHtml(wm.category.category_title));
                return;
            }
            wm.category.findPagesInCategoryCallbackRoot(root);
        };

        // Stores the category members found in `root` and starts fetching excerpts.
        this.findPagesInCategoryCallbackRoot = function(root) {
            // an api error payload has no `query`; treat it as an empty category
            var members = (root && root.query && root.query.categorymembers) || [];

            // a new listing replaces any previous one
            wm.category.pages = [];
            wm.category.pagesById = {};
            wm.category.excerptsFound = 0;

            for (var i = 0; i < members.length; i++) {
                var page = members[i];

                // defaults shown if the excerpt never arrives
                page.score = 'N/A';
                page.excerpt = '';
                page.totalSentences = 0;
                page.totalWords = 0;
                page.totalSyllables = 0;

                wm.category.pages.push(page);
                wm.category.pagesById[page.pageid] = page;
            }
            wm.category.pagesTotal = wm.category.pages.length;

            if (wm.category.pagesTotal === 0) {
                wm.category.writeHtml("No results found for Category:"
                    + wm.category.escapeHtml(wm.category.category_title));
            } else {
                // get excerpts
                wm.category.getExcerpts();
            }
        };

        // **********************************************
        // get excerpts
        // **********************************************
        // Requests the intro excerpt of each stored page, up to excerptsMax pages.
        this.getExcerpts = function() {
            var pages = wm.category.pages;
            var limit = Math.min(pages.length, wm.category.excerptsMax);

            // NOTE: pagesTotal is the number of responses to wait for, so set it before
            // issuing requests (canned responses complete synchronously)
            wm.category.pagesTotal = limit;
            if (limit === 0) {
                wm.category.printResults();
                return;
            }

            for (var i = 0; i < limit; i++) {
                // run ajax; NOTE: must specify origin to bypass CORS; http://stackoverflow.com/a/38921370
                if (wm.category.production) {
                    var url = 'https://' + wm.category.domain
                        + '/w/api.php?action=query&format=json&formatversion=2&origin=*&prop=extracts&exintro=1&explaintext'
                        + '&titles=' + encodeURIComponent(pages[i].title);
                    wm.category.runAjax(url, wm.category.getExcerptCallback);
                } else {
                    var root = {"query":{"pages": [ {"pageid":9228,"ns":0,"title":"Earth","extract":"Earth (Greek: Γαῖα Gaia; Latin: Terra)."} ]}};
                    wm.category.getExcerptCallbackRoot(root);
                }
            }
        };

        // XMLHttpRequest onreadystatechange handler for an excerpt request (`this` is the xhr).
        this.getExcerptCallback = function() {
            if (this.readyState !== 4) return;
            // a failed request is passed on as null so that it is still counted as
            // completed; otherwise the results would never be printed
            var root = this.status === 200 ? parseJson(this.responseText) : null;
            wm.category.getExcerptCallbackRoot(root);
        };

        // Scores the excerpt in `root` (null = failed request) and prints the results
        // once every awaited excerpt has completed.
        this.getExcerptCallbackRoot = function(root) {
            // only 1 page per api call
            var page = root && root.query && root.query.pages ? root.query.pages[0] : null;
            var category = null;
            if (page && Object.prototype.hasOwnProperty.call(wm.category.pagesById, page.pageid)) {
                category = wm.category.pagesById[page.pageid];
            }

            if (category) {
                // pages without an intro have no `extract` key
                var excerpt = page.extract == null ? '' : page.extract;

                // calc readability score
                var score = wm.category.calcReadabilityScore(page.title, excerpt);

                // update local category
                category.excerpt = excerpt;
                category.score = score[0];
                category.totalSentences = score[1];
                category.totalWords = score[2];
                category.totalSyllables = score[3];
                if (!category.score) console.log(JSON.stringify(category));
            }

            // if last category, print all
            if (++wm.category.excerptsFound === wm.category.pagesTotal) {
                wm.category.printResults();
            }
        };

        // **********************************************
        // calc readability
        // **********************************************
        // Computes the Flesch reading-ease score of text `s` (`title` is unused).
        // Returns [score, totalSentences, totalWords, totalSyllables]; score is 999
        // when the text has fewer than two words or no countable sentence.
        this.calcReadabilityScore = function(title, s) {
            // REF: https://en.wikipedia.org/wiki/Flesch–Kincaid_readability_tests
            // count words and sentences
            var words = wm.category.toWordArray(s);
            var totalWords = words.length;
            if (totalWords <= 1) return [SCORE_NOT_AVAILABLE, 0, 0, 0];
            var totalSentences = wm.category.countSentences(s);

            // count syllables
            var totalSyllables = 0;
            for (var i = 0; i < totalWords; i++) {
                totalSyllables += wm.category.countSyllablesInWord(words[i]);
            }

            // no sentence found: avoid dividing by zero (would yield -Infinity)
            if (totalSentences === 0) return [SCORE_NOT_AVAILABLE, 0, totalWords, totalSyllables];

            // calc score: again, see https://en.wikipedia.org/wiki/Flesch–Kincaid_readability_tests
            var score = 206.835 - (1.015 * (totalWords / totalSentences)) - (84.6 * (totalSyllables / totalWords));
            return [score, totalSentences, totalWords, totalSyllables];
        };

        // Splits `s` into words on any run of whitespace (spaces, tabs, newlines).
        // Returns [] for null, undefined, empty or all-whitespace input.
        this.toWordArray = function(s) {
            if (s == null) return [];
            var trimmed = String(s).replace(/^\s+|\s+$/g, '');
            return trimmed === '' ? [] : trimmed.split(/\s+/);
        };

        // Counts sentences in `s`: fragments ending in . ? or ! that are at least 5 chars long.
        this.countSentences = function(s) {
            // REF: http://stackoverflow.com/questions/35215348/count-sentences-in-string-with-javascript
            if (s == null) return 0;
            var fragments = String(s).replace(/\w[.?!](\s|$)/g, "$1|").split("|");
            var count = 0;
            for (var i = 0; i < fragments.length; i++) {
                var sentence = fragments[i].trim();
                // 0 length: "Yes." becomes ["Ye", ""] so the trailing piece is ignored
                // < 5: heuristic for acronym pieces; NOTE: "Words U.S.A." still breaks up
                // into ["Words U", "S", "A"], which is not handled
                if (sentence.length >= 5) count++;
            }
            return count;
        };

        // Estimates the number of syllables in `word` (always at least 1).
        this.countSyllablesInWord = function(word) {
            // REF: http://stackoverflow.com/questions/5686483/how-to-compute-number-of-syllables-in-a-word-in-javascript
            word = word.toLowerCase();
            if (word.length <= 3) return 1;
            // drop silent endings, then a leading y, then count vowel groups
            word = word.replace(/(?:[^laeiouy]es|ed|[^laeiouy]e)$/, '');
            word = word.replace(/^y/, '');
            var groups = word.match(/[aeiouy]{1,2}/g);
            return groups == null ? 1 : groups.length;
        };

        // **********************************************
        // printResults
        // **********************************************
        // Sorts the pages by score (in place) and writes the result table.
        // NOTE(review): the tag structure used here (div / h3 / table / tr / th / td / br)
        // is a best-effort choice; confirm against the hosting page's stylesheet.
        this.printResults = function() {
            // sort results by score
            wm.category.pages.sort(wm.category.compareResult);

            // generate string; every dynamic value is html-escaped because titles,
            // excerpts and the url args are untrusted text
            var s = '<div>\n'
                + '<h3>Readability scores for member pages of '
                + wm.category.buildWikiLink(wm.category.domain, 'Category:' + wm.category.category_title)
                + ' in ' + wm.category.escapeHtml(wm.category.domain) + '</h3>\n'
                + '</div>\n'
                + '<br/>\n'
                + '<table>\n'
                + '<tr>\n'
                + '<th>Title</th>\n'
                + '<th>Score</th>\n'
                + '</tr>';

            var pages = wm.category.pages;
            for (var i = 0; i < pages.length; i++) {
                var category = pages[i];

                // pages that were never scored keep a non-numeric score
                var score = isUsableScore(category.score) ? category.score.toFixed(2) : 'N/A';

                s += '<tr>\n'
                    + '<td>' + wm.category.buildWikiLink(wm.category.domain, category.title) + '\n'
                    + ' <br/>\n'
                    + ' Sentences: ' + wm.category.escapeHtml(category.totalSentences) + '<br/>\n'
                    + ' Words: ' + wm.category.escapeHtml(category.totalWords) + '<br/>\n'
                    + ' Syllables: ' + wm.category.escapeHtml(category.totalSyllables) + '<br/>\n'
                    + '<p>\n' + wm.category.escapeHtml(category.excerpt) + '</p>\n'
                    + '</td>\n'
                    + '<td>' + score + '</td>\n'
                    + '</tr>\n';
            }
            s += '</table>';

            // print string
            wm.category.writeHtml(s);
        };

        // Returns an html anchor to `page` on `wiki`, with the url encoded and the label escaped.
        this.buildWikiLink = function(wiki, page) {
            // encode everything, then restore ':' and '/' which are legal in wiki page urls
            var page_enc = encodeURIComponent(String(page).replace(/ /g, '_'))
                .replace(/%3A/gi, ':')
                .replace(/%2F/gi, '/');
            var href = 'https://' + wiki + '/wiki/' + page_enc;
            return '<a href="' + wm.category.escapeHtml(href) + '">' + wm.category.escapeHtml(page) + '</a>';
        };

        // Sort comparator: least readable (lowest score) first; unscored pages last.
        this.compareResult = function(lhs, rhs) {
            var l = isUsableScore(lhs.score) ? lhs.score : Infinity;
            var r = isUsableScore(rhs.score) ? rhs.score : Infinity;
            if (l === r) return 0;
            return l < r ? -1 : 1;
        };

        // **********************************************
        // utility
        // **********************************************
        // Escapes `s` for safe inclusion in html text or a quoted attribute value.
        this.escapeHtml = function(s) {
            return String(s == null ? '' : s)
                .replace(/&/g, '&amp;')
                .replace(/</g, '&lt;')
                .replace(/>/g, '&gt;')
                .replace(/"/g, '&quot;')
                .replace(/'/g, '&#39;');
        };

        // Issues an async GET for `url`; `callback` is installed as onreadystatechange.
        this.runAjax = function(url, callback) {
            var xhr = new XMLHttpRequest();
            xhr.open("GET", url, true);
            xhr.onreadystatechange = callback;
            xhr.send();
        };

        // Replaces the whole document body with `html`.
        this.writeHtml = function(html) {
            document.body.innerHTML = html;
        };
    };
}((function () {
    // attach to window in a browser; fall back to the host's global object elsewhere
    // so the module can be loaded (and its pure helpers tested) without a DOM
    var root = typeof window !== 'undefined' ? window
        : (typeof global !== 'undefined' ? global : {});
    root.wm = root.wm || {};
    return root.wm;
}())));