gristlabs_grist-core/app/client/lib/ACIndex.ts
Dmitry S 6e844a2e76 (core) Use unicode-aware comparisons for user-visible strings.
Summary:
- Switch code that compares user strings to use localeCompare() based on Intl.Collator.
- Use en-US locale for now. (Ideally should be a document property.)
- Note that with this change, sorting is also becoming case-insensitive (which
  seems an improvement)

- Updated a sorted test fixture
- Updated a browser test with lots of unicode to expect different order.
- Added a bit of unicode to test ordering in Reference autocomplete dropdown.

Test Plan: Fixed / updated tests

Reviewers: paulfitz

Reviewed By: paulfitz

Differential Revision: https://phab.getgrist.com/D2758
2021-03-15 09:54:10 -04:00

251 lines
10 KiB
TypeScript

/**
* A search index for auto-complete suggestions.
*
* This implementation indexes words, and suggests items based on a best-match score, including
* amount of overlap and position of words. It searches case-insensitively and only at the start
* of words. E.g. searching for "Blue" would match "Blu" in "Lavender Blush", but searching for
* "lush" would only match the "L" in "Lavender".
*/
import {localeCompare, nativeCompare, sortedIndex} from 'app/common/gutil';
import {DomContents} from 'grainjs';
// The minimal shape an item must have to be placed into the auto-complete index.
export interface ACItem {
// This should be a trimmed lowercase version of the item's text. It may be an accessor.
// ACIndexImpl splits this text into words when building its index, and compares it against the
// cleaned (trimmed, lowercased) search text.
// Note that items with empty cleanText are never suggested.
cleanText: string;
}
// Regexp used to split text into words; includes nearly all punctuation. This means that
// "foo-bar" may be searched by "bar", but it's impossible to search for punctuation itself (e.g.
// "a-b" and "a+b" are not distinguished). (It's easy to exclude unicode punctuation too if the
// need arises, see https://stackoverflow.com/a/25575009/328565).
// Because of the trailing "+", a run of consecutive separator characters (whitespace or any of
// the listed ASCII punctuation) acts as a single separator.
const wordSepRegexp = /[\s!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]+/;
/**
* An auto-complete index, which simply allows searching for a string.
*/
export interface ACIndex<Item extends ACItem> {
// Returns the suggestions for the given search text, packaged as ACResults.
search(searchText: string): ACResults<Item>;
}
// A function that cuts text into consecutive pieces; the pieces at odd indices (1, 3, ...) are
// the ones that should be highlighted, the pieces at even indices are left plain.
export type HighlightFunc = (text: string) => string[];

// The HighlightFunc that highlights nothing: the whole text comes back as the single piece at
// index 0.
export const highlightNone: HighlightFunc = (text) => {
  return [text];
};
/**
* AutoComplete results include the suggested items, which one to highlight, and a function for
* highlighting the matched portion of each item.
*/
export interface ACResults<Item extends ACItem> {
// Matching items in order from best match to worst. (ACIndexImpl also appends non-matching
// items after the matches, until its limit on the number of results is reached.)
items: Item[];
// May be used to highlight matches using buildHighlightedDom().
highlightFunc: HighlightFunc;
// Index into items of the item to select by default (normally 0), or -1 if no item is a good
// enough match to be auto-selected.
selectIndex: number;
}
// A single occurrence of a word within an item; these are the entries of the sorted word list
// kept by ACIndexImpl.
interface Word {
word: string; // The indexed word
index: number; // Index into _allItems for the item containing this word.
pos: number; // Ordinal position of the word among the words of its item (0 for the first word).
}
/**
 * Implements a search index. It doesn't currently support updates; when any values change, the
 * index needs to be rebuilt from scratch.
 */
export class ACIndexImpl<Item extends ACItem> implements ACIndex<Item> {
  // Private copy of the items given to the constructor. Word entries and match scores refer to
  // items by their index in this array.
  private _allItems: Item[];

  // All words from _allItems, sorted using localeCompare(). Any binary search over this array
  // must use that same comparison.
  private _words: Word[];

  // Creates an index for the given list of items.
  // The max number of items to suggest may be set using _maxResults (default is 50).
  constructor(items: Item[], private _maxResults: number = 50) {
    this._allItems = items.slice(0);

    // Collects {word, index, pos} entries for all words in _allItems.
    const allWords: Word[] = [];
    for (let index = 0; index < this._allItems.length; index++) {
      const item = this._allItems[index];
      const words = item.cleanText.split(wordSepRegexp).filter(w => w);
      for (let pos = 0; pos < words.length; pos++) {
        allWords.push({word: words[pos], index, pos});
      }
    }

    allWords.sort((a, b) => localeCompare(a.word, b.word));
    this._words = allWords;
  }

  /**
   * The main search function. SearchText will be cleaned (trimmed and lowercased) at the start.
   *
   * Returns up to _maxResults items: matching items first, ordered by score (best first, ties
   * broken by original item order), followed by non-matching items with non-empty cleanText in
   * their original order. Empty search text therefore returns the first N items in the search
   * universe, with no highlighting and no selection.
   */
  public search(searchText: string): ACResults<Item> {
    const cleanedSearchText = searchText.trim().toLowerCase();
    const searchWords = cleanedSearchText.split(wordSepRegexp).filter(w => w);

    // Maps item index in _allItems to its score.
    const myMatches = new Map<number, number>();

    if (searchWords.length > 0) {
      // For each of searchWords, go through items with an overlap, and update their scores.
      for (let k = 0; k < searchWords.length; k++) {
        const searchWord = searchWords[k];
        for (const [itemIndex, score] of this._findOverlaps(searchWord, k)) {
          myMatches.set(itemIndex, (myMatches.get(itemIndex) || 0) + score);
        }
      }

      // Give an extra point to items that start with the searchText.
      for (const [itemIndex, score] of myMatches) {
        if (this._allItems[itemIndex].cleanText.startsWith(cleanedSearchText)) {
          myMatches.set(itemIndex, score + 1);
        }
      }
    }

    // Array of pairs [itemIndex, score], sorted by score (desc) and itemIndex.
    const sortedMatches = Array.from(myMatches)
      .sort((a, b) => nativeCompare(b[1], a[1]) || nativeCompare(a[0], b[0]))
      .slice(0, this._maxResults);

    const items: Item[] = sortedMatches.map(([index]) => this._allItems[index]);

    // Append enough non-matching items to reach maxResults.
    for (let i = 0; i < this._allItems.length && items.length < this._maxResults; i++) {
      if (this._allItems[i].cleanText && !myMatches.has(i)) {
        items.push(this._allItems[i]);
      }
    }

    if (!cleanedSearchText) {
      // In this case we are just returning the first few items.
      return {items, highlightFunc: highlightNone, selectIndex: -1};
    }

    const highlightFunc = highlightMatches.bind(null, searchWords);

    // The best match is the first item. If it actually starts with the search text, AND has a
    // strictly better score than other items, highlight it as a default selection. Otherwise, no
    // item will be auto-selected.
    let selectIndex = -1;
    if (items.length > 0 && items[0].cleanText.startsWith(cleanedSearchText) &&
        (sortedMatches.length <= 1 || sortedMatches[1][1] < sortedMatches[0][1])) {
      selectIndex = 0;
    }
    return {items, highlightFunc, selectIndex};
  }

  /**
   * Given one of the search words, looks it up in the indexed list of words and searches up and
   * down the list for all words that share a prefix with it. Each such word contributes something
   * to the score of the index entry it is a part of.
   *
   * Returns a Map from the index entry (index into _allItems) to the score which this searchWord
   * contributes to it.
   *
   * The searchWordPos argument is the position of searchWord in the overall search text (e.g. 0
   * if it's the first word). It is used for the position bonus, to give higher scores to entries
   * whose words occur in the same order as in the search text.
   */
  private _findOverlaps(searchWord: string, searchWordPos: number): Map<number, number> {
    // The binary search must use the same comparison that _words was sorted with in the
    // constructor. With a different ordering (e.g. plain code-unit order, which places accented
    // letters after "z"), the search can land in the wrong place whenever _words contains
    // non-ASCII words, and the scans below would then miss even exact matches.
    const insertIndex = sortedIndex<{word: string}>(this._words, {word: searchWord},
      (a, b) => localeCompare(a.word, b.word));

    // Maps index of item to its score.
    const scored = new Map<number, number>();

    // Search up and down the list, accepting smaller and smaller overlap.
    for (const step of [1, -1]) {
      let prefix = searchWord;
      // Going up, start at the insertion point; going down, start just below it.
      let index = insertIndex + (step > 0 ? 0 : -1);
      while (prefix && index >= 0 && index < this._words.length) {
        for ( ; index >= 0 && index < this._words.length; index += step) {
          const wordEntry = this._words[index];
          // Once we reach a word that doesn't start with our prefix, break this loop, so we can
          // reduce the length of the prefix and keep scanning.
          if (!wordEntry.word.startsWith(prefix)) { break; }

          // The contribution of this word to the score consists primarily of the length of
          // overlap (i.e. length for the current prefix).
          const baseScore = prefix.length;

          // To this we add 1 if the word matches exactly.
          const fullWordBonus = (wordEntry.word === searchWord ? 1 : 0);

          // To prefer matches where words occur in the same order as searched (e.g. searching for
          // "Foo B" should prefer "Foo Bar" over "Bar Foo"), we give a bonus based on the
          // position of the word in the search text and the entry text. (If positions match as
          // 0:0 and 1:1, the total position bonus is 2^0+2^(-2)=1.25; while the bonus from 0:1
          // and 1:0 would be 2^(-1) + 2^(-1)=1.0.)
          const positionBonus = Math.pow(2, -(searchWordPos + wordEntry.pos));

          const itemScore = baseScore + fullWordBonus + positionBonus;
          // Each search word contributes only one score (e.g. a search for "Foo" will partially
          // match both words in "forty five", but only the higher of the matches will count).
          if (itemScore >= (scored.get(wordEntry.index) || 0)) {
            scored.set(wordEntry.index, itemScore);
          }
        }
        prefix = prefix.slice(0, -1);
      }
    }
    return scored;
  }
}
// Builds the DOM content used to render one highlighted piece of text.
export type BuildHighlightFunc = (match: string) => DomContents;

/**
 * Turns text into DOM content: highlightFunc cuts the text into pieces, and every odd-indexed
 * piece is rendered via highlight(piece) while even-indexed pieces are kept as plain strings.
 * Empty text is returned as is, without calling highlightFunc.
 */
export function buildHighlightedDom(
  text: string, highlightFunc: HighlightFunc, highlight: BuildHighlightFunc
): DomContents {
  if (text) {
    const pieces = highlightFunc(text);
    return pieces.map((piece, idx) => (idx % 2 === 1 ? highlight(piece) : piece));
  }
  return text;
}
// The wordSepRegexp pattern wrapped in a capturing group, so that String.split() keeps the
// separators in its output (at odd indices) rather than dropping them.
const wordSepRegexpParen = new RegExp('(' + wordSepRegexp.source + ')');
/**
 * Cuts text into pieces such that the odd-indexed pieces are the parts to highlight: each is the
 * start of a word of text that coincides (case-insensitively) with the start of some searchWord.
 * Even-indexed pieces hold everything else, including the separators between words.
 */
function highlightMatches(searchWords: string[], text: string): string[] {
  // Splitting on a capturing regexp alternates words (even indices) and separators (odd indices).
  const tokens = text.split(wordSepRegexpParen);
  const pieces: string[] = [''];
  let cursor = 0;
  while (cursor < tokens.length) {
    const current = tokens[cursor];
    const trailing = tokens[cursor + 1] || '';
    cursor += 2;

    const matchedLen = findLongestPrefixLen(current.toLowerCase(), searchWords);
    if (matchedLen !== 0) {
      // Emit the matched prefix as a new highlighted piece, then open a new plain piece with the
      // rest of the word and the separator following it.
      pieces.push(current.slice(0, matchedLen), current.slice(matchedLen) + trailing);
      continue;
    }
    // Nothing to highlight in this word: extend the current plain piece.
    pieces[pieces.length - 1] += current + trailing;
  }
  return pieces;
}
// Returns the length of the longest prefix that text shares with any one of choices, or 0 if
// there is no overlap (or no choices).
function findLongestPrefixLen(text: string, choices: string[]): number {
  let longest = 0;
  for (const candidate of choices) {
    const shared = findCommonPrefixLength(text, candidate);
    if (shared > longest) { longest = shared; }
  }
  return longest;
}
// Returns the number of leading characters (UTF-16 code units) that text1 and text2 have in
// common.
function findCommonPrefixLength(text1: string, text2: string): number {
  const limit = Math.min(text1.length, text2.length);
  for (let pos = 0; pos < limit; pos++) {
    if (text1.charAt(pos) !== text2.charAt(pos)) { return pos; }
  }
  return limit;
}