2020-10-02 15:10:00 +00:00
|
|
|
/**
|
|
|
|
* A search index for auto-complete suggestions.
|
|
|
|
*
|
|
|
|
* This implementation indexes words, and suggests items based on a best-match score, including
|
|
|
|
* amount of overlap and position of words. It searches case-insensitively and only at the start
|
|
|
|
* of words. E.g. searching for "Blue" would match "Blu" in "Lavender Blush", but searching for
|
|
|
|
* "lush" would only match the "L" in "Lavender".
|
|
|
|
*/
|
|
|
|
|
2022-08-17 16:35:50 +00:00
|
|
|
import {localeCompare, nativeCompare, sortedIndex} from 'app/common/gutil';
|
2020-10-02 15:10:00 +00:00
|
|
|
import {DomContents} from 'grainjs';
|
2021-07-22 22:17:55 +00:00
|
|
|
import escapeRegExp = require("lodash/escapeRegExp");
|
2022-08-17 16:35:50 +00:00
|
|
|
import deburr = require("lodash/deburr");
|
2022-08-26 06:05:05 +00:00
|
|
|
import split = require("lodash/split");
|
2020-10-02 15:10:00 +00:00
|
|
|
|
|
|
|
export interface ACItem {
|
|
|
|
// This should be a trimmed lowercase version of the item's text. It may be an accessor.
|
|
|
|
// Note that items with empty cleanText are never suggested.
|
|
|
|
cleanText: string;
|
|
|
|
}
|
|
|
|
|
2022-08-17 16:35:50 +00:00
|
|
|
// Returns a trimmed, lowercase version of a string,
|
|
|
|
// from which accents and other diacritics have been removed,
|
|
|
|
// so that autocomplete is case- and accent-insensitive.
|
2022-08-17 16:40:46 +00:00
|
|
|
export function normalizeText(text: string): string {
|
2022-08-17 16:35:50 +00:00
|
|
|
return deburr(text).trim().toLowerCase();
|
2022-08-09 13:43:44 +00:00
|
|
|
}
|
|
|
|
|
2020-10-02 15:10:00 +00:00
|
|
|
// Regexp used to split text into words; includes nearly all punctuation. This means that
|
|
|
|
// "foo-bar" may be searched by "bar", but it's impossible to search for punctuation itself (e.g.
|
|
|
|
// "a-b" and "a+b" are not distinguished). (It's easy to exclude unicode punctuation too if the
|
|
|
|
// need arises, see https://stackoverflow.com/a/25575009/328565).
|
|
|
|
const wordSepRegexp = /[\s!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]+/;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* An auto-complete index, which simply allows searching for a string.
|
|
|
|
*/
|
|
|
|
export interface ACIndex<Item extends ACItem> {
|
|
|
|
search(searchText: string): ACResults<Item>;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Splits text into an array of pieces, with odd-indexed pieces being the ones to highlight.
|
|
|
|
export type HighlightFunc = (text: string) => string[];
|
|
|
|
|
|
|
|
export const highlightNone: HighlightFunc = (text) => [text];
|
|
|
|
|
|
|
|
/**
|
|
|
|
* AutoComplete results include the suggested items, which one to highlight, and a function for
|
|
|
|
* highlighting the matched portion of each item.
|
|
|
|
*/
|
|
|
|
export interface ACResults<Item extends ACItem> {
|
|
|
|
// Matching items in order from best match to worst.
|
|
|
|
items: Item[];
|
|
|
|
|
|
|
|
// May be used to highlight matches using buildHighlightedDom().
|
|
|
|
highlightFunc: HighlightFunc;
|
|
|
|
|
|
|
|
// index of a good match (normally 0), or -1 if no great match
|
|
|
|
selectIndex: number;
|
|
|
|
}
|
|
|
|
|
|
|
|
interface Word {
|
|
|
|
word: string; // The indexed word
|
|
|
|
index: number; // Index into _allItems for the item containing this word.
|
|
|
|
pos: number; // Position of the word within the item where it occurred.
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Implements a search index. It doesn't currently support updates; when any values change, the
|
|
|
|
* index needs to be rebuilt from scratch.
|
|
|
|
*/
|
|
|
|
export class ACIndexImpl<Item extends ACItem> implements ACIndex<Item> {
|
|
|
|
private _allItems: Item[];
|
|
|
|
|
|
|
|
// All words from _allItems, sorted.
|
|
|
|
private _words: Word[];
|
|
|
|
|
|
|
|
// Creates an index for the given list of items.
|
|
|
|
// The max number of items to suggest may be set using _maxResults (default is 50).
|
2021-07-22 22:17:55 +00:00
|
|
|
// If _keepOrder is true, best matches will be suggested in the order they occur in items,
|
|
|
|
// rather than order by best score.
|
|
|
|
constructor(items: Item[], private _maxResults: number = 50, private _keepOrder = false) {
|
2020-10-02 15:10:00 +00:00
|
|
|
this._allItems = items.slice(0);
|
|
|
|
|
|
|
|
// Collects [word, occurrence, position] tuples for all words in _allItems.
|
|
|
|
const allWords: Word[] = [];
|
|
|
|
for (let index = 0; index < this._allItems.length; index++) {
|
|
|
|
const item = this._allItems[index];
|
|
|
|
const words = item.cleanText.split(wordSepRegexp).filter(w => w);
|
|
|
|
for (let pos = 0; pos < words.length; pos++) {
|
|
|
|
allWords.push({word: words[pos], index, pos});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-13 02:25:44 +00:00
|
|
|
allWords.sort((a, b) => localeCompare(a.word, b.word));
|
2020-10-02 15:10:00 +00:00
|
|
|
this._words = allWords;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// The main search function. SearchText will be cleaned (trimmed and lowercased) at the start.
|
|
|
|
// Empty search text returns the first N items in the search universe.
|
|
|
|
public search(searchText: string): ACResults<Item> {
|
2022-08-17 16:40:46 +00:00
|
|
|
const cleanedSearchText = normalizeText(searchText);
|
2020-10-02 15:10:00 +00:00
|
|
|
const searchWords = cleanedSearchText.split(wordSepRegexp).filter(w => w);
|
|
|
|
|
|
|
|
// Maps item index in _allItems to its score.
|
|
|
|
const myMatches = new Map<number, number>();
|
|
|
|
|
|
|
|
if (searchWords.length > 0) {
|
|
|
|
// For each of searchWords, go through items with an overlap, and update their scores.
|
|
|
|
for (let k = 0; k < searchWords.length; k++) {
|
|
|
|
const searchWord = searchWords[k];
|
|
|
|
for (const [itemIndex, score] of this._findOverlaps(searchWord, k)) {
|
|
|
|
myMatches.set(itemIndex, (myMatches.get(itemIndex) || 0) + score);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Give an extra point to items that start with the searchText.
|
|
|
|
for (const [itemIndex, score] of myMatches) {
|
|
|
|
if (this._allItems[itemIndex].cleanText.startsWith(cleanedSearchText)) {
|
|
|
|
myMatches.set(itemIndex, score + 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Array of pairs [itemIndex, score], sorted by score (desc) and itemIndex.
|
|
|
|
const sortedMatches = Array.from(myMatches)
|
|
|
|
.sort((a, b) => nativeCompare(b[1], a[1]) || nativeCompare(a[0], b[0]))
|
|
|
|
.slice(0, this._maxResults);
|
|
|
|
|
2021-07-22 22:17:55 +00:00
|
|
|
const itemIndices: number[] = sortedMatches.map(([index, score]) => index);
|
2020-10-02 15:10:00 +00:00
|
|
|
|
2021-07-22 22:17:55 +00:00
|
|
|
// Append enough non-matching indices to reach maxResults.
|
|
|
|
for (let i = 0; i < this._allItems.length && itemIndices.length < this._maxResults; i++) {
|
2020-10-02 15:10:00 +00:00
|
|
|
if (this._allItems[i].cleanText && !myMatches.has(i)) {
|
2021-07-22 22:17:55 +00:00
|
|
|
itemIndices.push(i);
|
2020-10-02 15:10:00 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-07-22 22:17:55 +00:00
|
|
|
if (this._keepOrder) {
|
|
|
|
itemIndices.sort(nativeCompare);
|
|
|
|
}
|
|
|
|
const items = itemIndices.map(index => this._allItems[index]);
|
|
|
|
|
2020-10-02 15:10:00 +00:00
|
|
|
if (!cleanedSearchText) {
|
|
|
|
// In this case we are just returning the first few items.
|
|
|
|
return {items, highlightFunc: highlightNone, selectIndex: -1};
|
|
|
|
}
|
|
|
|
|
|
|
|
const highlightFunc = highlightMatches.bind(null, searchWords);
|
|
|
|
|
2021-07-22 22:17:55 +00:00
|
|
|
// If we have a best match, and any word in it actually starts with the search text, report it
|
|
|
|
// as a default selection for highlighting. Otherwise, no item will be auto-selected.
|
|
|
|
let selectIndex = sortedMatches.length > 0 ? itemIndices.indexOf(sortedMatches[0][0]) : -1;
|
|
|
|
if (selectIndex >= 0 && !startsWithText(items[selectIndex], cleanedSearchText, searchWords)) {
|
|
|
|
selectIndex = -1;
|
2020-10-02 15:10:00 +00:00
|
|
|
}
|
|
|
|
return {items, highlightFunc, selectIndex};
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Given one of the search words, looks it up in the indexed list of words and searches up and
|
|
|
|
* down the list for all words that share a prefix with it. Each such word contributes something
|
|
|
|
* to the score of the index entry it is a part of.
|
|
|
|
*
|
|
|
|
* Returns a Map from the index entry (index into _allItems) to the score which this searchWord
|
|
|
|
* contributes to it.
|
|
|
|
*
|
|
|
|
* The searchWordPos argument is the position of searchWord in the overall search text (e.g. 0
|
|
|
|
* if it's the first word). It is used for the position bonus, to give higher scores to entries
|
|
|
|
* whose words occur in the same order as in the search text.
|
|
|
|
*/
|
|
|
|
private _findOverlaps(searchWord: string, searchWordPos: number): Map<number, number> {
|
|
|
|
const insertIndex = sortedIndex<{word: string}>(this._words, {word: searchWord},
|
2021-10-04 22:52:03 +00:00
|
|
|
(a, b) => localeCompare(a.word, b.word));
|
2020-10-02 15:10:00 +00:00
|
|
|
|
|
|
|
// Maps index of item to its score.
|
|
|
|
const scored = new Map<number, number>();
|
|
|
|
|
|
|
|
// Search up and down the list, accepting smaller and smaller overlap.
|
|
|
|
for (const step of [1, -1]) {
|
|
|
|
let prefix = searchWord;
|
|
|
|
let index = insertIndex + (step > 0 ? 0 : -1);
|
|
|
|
while (prefix && index >= 0 && index < this._words.length) {
|
|
|
|
for ( ; index >= 0 && index < this._words.length; index += step) {
|
|
|
|
const wordEntry = this._words[index];
|
|
|
|
// Once we reach a word that doesn't start with our prefix, break this loop, so we can
|
|
|
|
// reduce the length of the prefix and keep scanning.
|
|
|
|
if (!wordEntry.word.startsWith(prefix)) { break; }
|
|
|
|
|
|
|
|
// The contribution of this word's to the score consists primarily of the length of
|
|
|
|
// overlap (i.e. length for the current prefix).
|
|
|
|
const baseScore = prefix.length;
|
|
|
|
|
|
|
|
// To this we add 1 if the word matches exactly.
|
|
|
|
const fullWordBonus = (wordEntry.word === searchWord ? 1 : 0);
|
|
|
|
|
|
|
|
// To prefer matches where words occur in the same order as searched (e.g. searching for
|
|
|
|
// "Foo B" should prefer "Foo Bar" over "Bar Foo"), we give a bonus based on the
|
|
|
|
// position of the word in the search text and the entry text. (If positions match as
|
|
|
|
// 0:0 and 1:1, the total position bonus is 2^0+2^(-2)=1.25; while the bonus from 0:1
|
|
|
|
// and 1:0 would be 2^(-1) + 2^(-1)=1.0.)
|
|
|
|
const positionBonus = Math.pow(2, -(searchWordPos + wordEntry.pos));
|
|
|
|
|
|
|
|
const itemScore = baseScore + fullWordBonus + positionBonus;
|
|
|
|
// Each search word contributes only one score (e.g. a search for "Foo" will partially
|
|
|
|
// match both words in "forty five", but only the higher of the matches will count).
|
|
|
|
if (itemScore >= (scored.get(wordEntry.index) || 0)) {
|
|
|
|
scored.set(wordEntry.index, itemScore);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
prefix = prefix.slice(0, -1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return scored;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
export type BuildHighlightFunc = (match: string) => DomContents;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Converts text to DOM with matching bits of text rendered using highlight(match) function.
|
|
|
|
*/
|
|
|
|
export function buildHighlightedDom(
|
|
|
|
text: string, highlightFunc: HighlightFunc, highlight: BuildHighlightFunc
|
|
|
|
): DomContents {
|
|
|
|
if (!text) { return text; }
|
|
|
|
const parts = highlightFunc(text);
|
|
|
|
return parts.map((part, k) => k % 2 ? highlight(part) : part);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Same as wordSepRegexp, but with capturing parentheses.
|
|
|
|
const wordSepRegexpParen = new RegExp(`(${wordSepRegexp.source})`);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Splits text into pieces, with odd-numbered pieces the ones matching a prefix of some
|
|
|
|
* searchWord, i.e. the ones to highlight.
|
|
|
|
*/
|
|
|
|
function highlightMatches(searchWords: string[], text: string): string[] {
|
|
|
|
const textParts = text.split(wordSepRegexpParen);
|
|
|
|
const outputs = [''];
|
|
|
|
for (let i = 0; i < textParts.length; i += 2) {
|
|
|
|
const word = textParts[i];
|
|
|
|
const separator = textParts[i + 1] || '';
|
2022-08-26 06:05:05 +00:00
|
|
|
// deburr (remove diacritics) was used to produce searchWords, so `word` needs to match that.
|
|
|
|
const prefixLen = findLongestPrefixLen(deburr(word).toLowerCase(), searchWords);
|
2020-10-02 15:10:00 +00:00
|
|
|
if (prefixLen === 0) {
|
|
|
|
outputs[outputs.length - 1] += word + separator;
|
|
|
|
} else {
|
2022-08-26 06:05:05 +00:00
|
|
|
// Split into unicode 'characters' that keep diacritics combined
|
|
|
|
const chars = split(word, '');
|
|
|
|
outputs.push(
|
|
|
|
chars.slice(0, prefixLen).join(''),
|
|
|
|
chars.slice(prefixLen).join('') + separator
|
|
|
|
);
|
2020-10-02 15:10:00 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return outputs;
|
|
|
|
}
|
|
|
|
|
|
|
|
function findLongestPrefixLen(text: string, choices: string[]): number {
|
|
|
|
return choices.reduce((max, choice) => Math.max(max, findCommonPrefixLength(text, choice)), 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
function findCommonPrefixLength(text1: string, text2: string): number {
|
|
|
|
let i = 0;
|
|
|
|
while (i < text1.length && text1[i] === text2[i]) { ++i; }
|
|
|
|
return i;
|
|
|
|
}
|
2021-07-15 15:50:28 +00:00
|
|
|
|
|
|
|
/**
|
2021-07-22 22:17:55 +00:00
|
|
|
* Checks whether `item` starts with `text`, or whether all words of text are prefixes of the
|
|
|
|
* words of `item`. (E.g. it would return true if item is "New York", and text is "ne yo".)
|
2021-07-15 15:50:28 +00:00
|
|
|
*/
|
2021-07-22 22:17:55 +00:00
|
|
|
function startsWithText(item: ACItem, text: string, searchWords: string[]): boolean {
|
2021-07-15 15:50:28 +00:00
|
|
|
if (item.cleanText.startsWith(text)) { return true; }
|
|
|
|
|
2021-07-22 22:17:55 +00:00
|
|
|
const regexp = new RegExp(searchWords.map(w => `\\b` + escapeRegExp(w)).join('.*'));
|
|
|
|
const cleanText = item.cleanText.split(wordSepRegexp).join(' ');
|
|
|
|
return regexp.test(cleanText);
|
2021-07-15 15:50:28 +00:00
|
|
|
}
|