gristlabs_grist-core/app/client/lib/ACIndex.ts

/**
 * A search index for auto-complete suggestions.
 *
 * This implementation indexes words, and suggests items based on a best-match score, including
 * amount of overlap and position of words. It searches case-insensitively and only at the start
 * of words. E.g. searching for "Blue" would match "Blu" in "Lavender Blush", but searching for
 * "lush" would only match the "L" in "Lavender".
 */

import {localeCompare, nativeCompare, sortedIndex} from 'app/common/gutil';
import {DomContents} from 'grainjs';

export interface ACItem {
  // This should be a trimmed lowercase version of the item's text. It may be an accessor.
  // Note that items with empty cleanText are never suggested.
  cleanText: string;
}

// Regexp used to split text into words; includes nearly all punctuation. This means that
// "foo-bar" may be searched by "bar", but it's impossible to search for punctuation itself (e.g.
// "a-b" and "a+b" are not distinguished). (It's easy to exclude unicode punctuation too if the
// need arises, see https://stackoverflow.com/a/25575009/328565).
const wordSepRegexp = /[\s!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]+/;

/**
 * An auto-complete index, which simply allows searching for a string.
 */
export interface ACIndex<Item extends ACItem> {
  search(searchText: string): ACResults<Item>;
}

// Splits text into an array of pieces, with odd-indexed pieces being the ones to highlight.
export type HighlightFunc = (text: string) => string[];

export const highlightNone: HighlightFunc = (text) => [text];

/**
 * AutoComplete results include the suggested items, which one to highlight, and a function for
 * highlighting the matched portion of each item.
 */
export interface ACResults<Item extends ACItem> {
  // Matching items in order from best match to worst.
  items: Item[];

  // May be used to highlight matches using buildHighlightedDom().
  highlightFunc: HighlightFunc;

  // index of a good match (normally 0), or -1 if no great match
  selectIndex: number;
}

interface Word {
  word: string;     // The indexed word
  index: number;    // Index into _allItems for the item containing this word.
  pos: number;      // Position of the word within the item where it occurred.
}

/**
 * Implements a search index. It doesn't currently support updates; when any values change, the
 * index needs to be rebuilt from scratch.
 */
export class ACIndexImpl<Item extends ACItem> implements ACIndex<Item> {
  private _allItems: Item[];

  // All words from _allItems, sorted.
  private _words: Word[];

  // Creates an index for the given list of items.
  // The max number of items to suggest may be set using _maxResults (default is 50).
  constructor(items: Item[], private _maxResults: number = 50) {
    this._allItems = items.slice(0);

    // Collects [word, occurrence, position] tuples for all words in _allItems.
    const allWords: Word[] = [];
    for (let index = 0; index < this._allItems.length; index++) {
      const item = this._allItems[index];
      const words = item.cleanText.split(wordSepRegexp).filter(w => w);
      for (let pos = 0; pos < words.length; pos++) {
        allWords.push({word: words[pos], index, pos});
      }
    }

    allWords.sort((a, b) => localeCompare(a.word, b.word));
    this._words = allWords;
  }


  // The main search function. SearchText will be cleaned (trimmed and lowercased) at the start.
  // Empty search text returns the first N items in the search universe.
  public search(searchText: string): ACResults<Item> {
    const cleanedSearchText = searchText.trim().toLowerCase();
    const searchWords = cleanedSearchText.split(wordSepRegexp).filter(w => w);

    // Maps item index in _allItems to its score.
    const myMatches = new Map<number, number>();

    if (searchWords.length > 0) {
      // For each of searchWords, go through items with an overlap, and update their scores.
      for (let k = 0; k < searchWords.length; k++) {
        const searchWord = searchWords[k];
        for (const [itemIndex, score] of this._findOverlaps(searchWord, k)) {
          myMatches.set(itemIndex, (myMatches.get(itemIndex) || 0) + score);
        }
      }

      // Give an extra point to items that start with the searchText.
      for (const [itemIndex, score] of myMatches) {
        if (this._allItems[itemIndex].cleanText.startsWith(cleanedSearchText)) {
          myMatches.set(itemIndex, score + 1);
        }
      }
    }

    // Array of pairs [itemIndex, score], sorted by score (desc) and itemIndex.
    const sortedMatches = Array.from(myMatches)
      .sort((a, b) => nativeCompare(b[1], a[1]) || nativeCompare(a[0], b[0]))
      .slice(0, this._maxResults);

    const items: Item[] = sortedMatches.map(([index, score]) => this._allItems[index]);

    // Append enough non-matching items to reach maxResults.
    for (let i = 0; i < this._allItems.length && items.length < this._maxResults; i++) {
      if (this._allItems[i].cleanText && !myMatches.has(i)) {
        items.push(this._allItems[i]);
      }
    }

    if (!cleanedSearchText) {
      // In this case we are just returning the first few items.
      return {items, highlightFunc: highlightNone, selectIndex: -1};
    }

    const highlightFunc = highlightMatches.bind(null, searchWords);

    // The best match is the first item. If it actually starts with the search text, AND has a
    // strictly better score than other items, highlight it as a default selection. Otherwise, no
    // item will be auto-selected.
    let selectIndex = -1;
    if (items.length > 0 && items[0].cleanText.startsWith(cleanedSearchText) &&
        (sortedMatches.length <= 1 || sortedMatches[1][1] < sortedMatches[0][1])) {
      selectIndex = 0;
    }
    return {items, highlightFunc, selectIndex};
  }

  /**
   * Given one of the search words, looks it up in the indexed list of words and searches up and
   * down the list for all words that share a prefix with it. Each such word contributes something
   * to the score of the index entry it is a part of.
   *
   * Returns a Map from the index entry (index into _allItems) to the score which this searchWord
   * contributes to it.
   *
   * The searchWordPos argument is the position of searchWord in the overall search text (e.g. 0
   * if it's the first word). It is used for the position bonus, to give higher scores to entries
   * whose words occur in the same order as in the search text.
   */
  private _findOverlaps(searchWord: string, searchWordPos: number): Map<number, number> {
    const insertIndex = sortedIndex<{word: string}>(this._words, {word: searchWord},
      (a, b) => nativeCompare(a.word, b.word));

    // Maps index of item to its score.
    const scored = new Map<number, number>();

    // Search up and down the list, accepting smaller and smaller overlap.
    for (const step of [1, -1]) {
      let prefix = searchWord;
      let index = insertIndex + (step > 0 ? 0 : -1);
      while (prefix && index >= 0 && index < this._words.length) {
        for ( ; index >= 0 && index < this._words.length; index += step) {
          const wordEntry = this._words[index];
          // Once we reach a word that doesn't start with our prefix, break this loop, so we can
          // reduce the length of the prefix and keep scanning.
          if (!wordEntry.word.startsWith(prefix)) { break; }

          // The contribution of this word's to the score consists primarily of the length of
          // overlap (i.e. length for the current prefix).
          const baseScore = prefix.length;

          // To this we add 1 if the word matches exactly.
          const fullWordBonus = (wordEntry.word === searchWord ? 1 : 0);

          // To prefer matches where words occur in the same order as searched (e.g. searching for
          // "Foo B" should prefer "Foo Bar" over "Bar Foo"), we give a bonus based on the
          // position of the word in the search text and the entry text. (If positions match as
          // 0:0 and 1:1, the total position bonus is 2^0+2^(-2)=1.25; while the bonus from 0:1
          // and 1:0 would be 2^(-1) + 2^(-1)=1.0.)
          const positionBonus = Math.pow(2, -(searchWordPos + wordEntry.pos));

          const itemScore = baseScore + fullWordBonus + positionBonus;
          // Each search word contributes only one score (e.g. a search for "Foo" will partially
          // match both words in "forty five", but only the higher of the matches will count).
          if (itemScore >= (scored.get(wordEntry.index) || 0)) {
            scored.set(wordEntry.index, itemScore);
          }
        }
        prefix = prefix.slice(0, -1);
      }
    }
    return scored;
  }
}


export type BuildHighlightFunc = (match: string) => DomContents;

/**
 * Converts text to DOM with matching bits of text rendered using highlight(match) function.
 */
export function buildHighlightedDom(
  text: string, highlightFunc: HighlightFunc, highlight: BuildHighlightFunc
): DomContents {
  if (!text) { return text; }
  const parts = highlightFunc(text);
  return parts.map((part, k) => k % 2 ? highlight(part) : part);
}


// Same as wordSepRegexp, but with capturing parentheses.
const wordSepRegexpParen = new RegExp(`(${wordSepRegexp.source})`);

/**
 * Splits text into pieces, with odd-numbered pieces the ones matching a prefix of some
 * searchWord, i.e. the ones to highlight.
 */
function highlightMatches(searchWords: string[], text: string): string[] {
  const textParts = text.split(wordSepRegexpParen);
  const outputs = [''];
  for (let i = 0; i < textParts.length; i += 2) {
    const word = textParts[i];
    const separator = textParts[i + 1] || '';
    const prefixLen = findLongestPrefixLen(word.toLowerCase(), searchWords);
    if (prefixLen === 0) {
      outputs[outputs.length - 1] += word + separator;
    } else {
      outputs.push(word.slice(0, prefixLen), word.slice(prefixLen) + separator);
    }
  }
  return outputs;
}

function findLongestPrefixLen(text: string, choices: string[]): number {
  return choices.reduce((max, choice) => Math.max(max, findCommonPrefixLength(text, choice)), 0);
}

function findCommonPrefixLength(text1: string, text2: string): number {
  let i = 0;
  while (i < text1.length && text1[i] === text2[i]) { ++i; }
  return i;
}
(core) move client code to core Summary: This moves all client code to core, and makes minimal fix-ups to get grist and grist-core to compile correctly. The client works in core, but I'm leaving clean-up around the build and bundles to follow-up. Test Plan: existing tests pass; server-dev bundle looks sane Reviewers: dsagal Reviewed By: dsagal Differential Revision: https://phab.getgrist.com/D2627 2020-10-02 15:10:00 +00:00			`/**`
			`* A search index for auto-complete suggestions.`
			`*`
			`* This implementation indexes words, and suggests items based on a best-match score, including`
			`* amount of overlap and position of words. It searches case-insensitively and only at the start`
			`* of words. E.g. searching for "Blue" would match "Blu" in "Lavender Blush", but searching for`
			`* "lush" would only match the "L" in "Lavender".`
			`*/`

(core) Use unicode-aware comparisons for user-visible strings. Summary: - Switch code that compares user strings to use localeCompare() based on Intl.Collator. - Use en-US locale for now. (Ideally should be a document property.) - Note that with this change, sorting is also becoming case-insensitive (which seems an improvement) - Updated a sorted test fixture - Updated a browser test with lots of unicode to expect different order. - Added a bit of unicode to test ordering in Reference autocomplete dropdown. Test Plan: Fixed / updated tests Reviewers: paulfitz Reviewed By: paulfitz Differential Revision: https://phab.getgrist.com/D2758 2021-03-13 02:25:44 +00:00			`import {localeCompare, nativeCompare, sortedIndex} from 'app/common/gutil';`
(core) move client code to core Summary: This moves all client code to core, and makes minimal fix-ups to get grist and grist-core to compile correctly. The client works in core, but I'm leaving clean-up around the build and bundles to follow-up. Test Plan: existing tests pass; server-dev bundle looks sane Reviewers: dsagal Reviewed By: dsagal Differential Revision: https://phab.getgrist.com/D2627 2020-10-02 15:10:00 +00:00			`import {DomContents} from 'grainjs';`

			`export interface ACItem {`
			`// This should be a trimmed lowercase version of the item's text. It may be an accessor.`
			`// Note that items with empty cleanText are never suggested.`
			`cleanText: string;`
			`}`

			`// Regexp used to split text into words; includes nearly all punctuation. This means that`
			`// "foo-bar" may be searched by "bar", but it's impossible to search for punctuation itself (e.g.`
			`// "a-b" and "a+b" are not distinguished). (It's easy to exclude unicode punctuation too if the`
			`// need arises, see https://stackoverflow.com/a/25575009/328565).`
			const wordSepRegexp = /[\s!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{\|}~]+/;

			`/**`
			`* An auto-complete index, which simply allows searching for a string.`
			`*/`
			`export interface ACIndex<Item extends ACItem> {`
			`search(searchText: string): ACResults<Item>;`
			`}`

			`// Splits text into an array of pieces, with odd-indexed pieces being the ones to highlight.`
			`export type HighlightFunc = (text: string) => string[];`

			`export const highlightNone: HighlightFunc = (text) => [text];`

			`/**`
			`* AutoComplete results include the suggested items, which one to highlight, and a function for`
			`* highlighting the matched portion of each item.`
			`*/`
			`export interface ACResults<Item extends ACItem> {`
			`// Matching items in order from best match to worst.`
			`items: Item[];`

			`// May be used to highlight matches using buildHighlightedDom().`
			`highlightFunc: HighlightFunc;`

			`// index of a good match (normally 0), or -1 if no great match`
			`selectIndex: number;`
			`}`

			`interface Word {`
			`word: string; // The indexed word`
			`index: number; // Index into _allItems for the item containing this word.`
			`pos: number; // Position of the word within the item where it occurred.`
			`}`

			`/**`
			`* Implements a search index. It doesn't currently support updates; when any values change, the`
			`* index needs to be rebuilt from scratch.`
			`*/`
			`export class ACIndexImpl<Item extends ACItem> implements ACIndex<Item> {`
			`private _allItems: Item[];`

			`// All words from _allItems, sorted.`
			`private _words: Word[];`

			`// Creates an index for the given list of items.`
			`// The max number of items to suggest may be set using _maxResults (default is 50).`
			`constructor(items: Item[], private _maxResults: number = 50) {`
			`this._allItems = items.slice(0);`

			`// Collects [word, occurrence, position] tuples for all words in _allItems.`
			`const allWords: Word[] = [];`
			`for (let index = 0; index < this._allItems.length; index++) {`
			`const item = this._allItems[index];`
			`const words = item.cleanText.split(wordSepRegexp).filter(w => w);`
			`for (let pos = 0; pos < words.length; pos++) {`
			`allWords.push({word: words[pos], index, pos});`
			`}`
			`}`

(core) Use unicode-aware comparisons for user-visible strings. Summary: - Switch code that compares user strings to use localeCompare() based on Intl.Collator. - Use en-US locale for now. (Ideally should be a document property.) - Note that with this change, sorting is also becoming case-insensitive (which seems an improvement) - Updated a sorted test fixture - Updated a browser test with lots of unicode to expect different order. - Added a bit of unicode to test ordering in Reference autocomplete dropdown. Test Plan: Fixed / updated tests Reviewers: paulfitz Reviewed By: paulfitz Differential Revision: https://phab.getgrist.com/D2758 2021-03-13 02:25:44 +00:00			`allWords.sort((a, b) => localeCompare(a.word, b.word));`
(core) move client code to core Summary: This moves all client code to core, and makes minimal fix-ups to get grist and grist-core to compile correctly. The client works in core, but I'm leaving clean-up around the build and bundles to follow-up. Test Plan: existing tests pass; server-dev bundle looks sane Reviewers: dsagal Reviewed By: dsagal Differential Revision: https://phab.getgrist.com/D2627 2020-10-02 15:10:00 +00:00			`this._words = allWords;`
			`}`


			`// The main search function. SearchText will be cleaned (trimmed and lowercased) at the start.`
			`// Empty search text returns the first N items in the search universe.`
			`public search(searchText: string): ACResults<Item> {`
			`const cleanedSearchText = searchText.trim().toLowerCase();`
			`const searchWords = cleanedSearchText.split(wordSepRegexp).filter(w => w);`

			`// Maps item index in _allItems to its score.`
			`const myMatches = new Map<number, number>();`

			`if (searchWords.length > 0) {`
			`// For each of searchWords, go through items with an overlap, and update their scores.`
			`for (let k = 0; k < searchWords.length; k++) {`
			`const searchWord = searchWords[k];`
			`for (const [itemIndex, score] of this._findOverlaps(searchWord, k)) {`
			`myMatches.set(itemIndex, (myMatches.get(itemIndex) \|\| 0) + score);`
			`}`
			`}`

			`// Give an extra point to items that start with the searchText.`
			`for (const [itemIndex, score] of myMatches) {`
			`if (this._allItems[itemIndex].cleanText.startsWith(cleanedSearchText)) {`
			`myMatches.set(itemIndex, score + 1);`
			`}`
			`}`
			`}`

			`// Array of pairs [itemIndex, score], sorted by score (desc) and itemIndex.`
			`const sortedMatches = Array.from(myMatches)`
			`.sort((a, b) => nativeCompare(b[1], a[1]) \|\| nativeCompare(a[0], b[0]))`
			`.slice(0, this._maxResults);`

			`const items: Item[] = sortedMatches.map(([index, score]) => this._allItems[index]);`

			`// Append enough non-matching items to reach maxResults.`
			`for (let i = 0; i < this._allItems.length && items.length < this._maxResults; i++) {`
			`if (this._allItems[i].cleanText && !myMatches.has(i)) {`
			`items.push(this._allItems[i]);`
			`}`
			`}`

			`if (!cleanedSearchText) {`
			`// In this case we are just returning the first few items.`
			`return {items, highlightFunc: highlightNone, selectIndex: -1};`
			`}`

			`const highlightFunc = highlightMatches.bind(null, searchWords);`

			`// The best match is the first item. If it actually starts with the search text, AND has a`
			`// strictly better score than other items, highlight it as a default selection. Otherwise, no`
			`// item will be auto-selected.`
			`let selectIndex = -1;`
			`if (items.length > 0 && items[0].cleanText.startsWith(cleanedSearchText) &&`
			`(sortedMatches.length <= 1 \|\| sortedMatches[1][1] < sortedMatches[0][1])) {`
			`selectIndex = 0;`
			`}`
			`return {items, highlightFunc, selectIndex};`
			`}`

			`/**`
			`* Given one of the search words, looks it up in the indexed list of words and searches up and`
			`* down the list for all words that share a prefix with it. Each such word contributes something`
			`* to the score of the index entry it is a part of.`
			`*`
			`* Returns a Map from the index entry (index into _allItems) to the score which this searchWord`
			`* contributes to it.`
			`*`
			`* The searchWordPos argument is the position of searchWord in the overall search text (e.g. 0`
			`* if it's the first word). It is used for the position bonus, to give higher scores to entries`
			`* whose words occur in the same order as in the search text.`
			`*/`
			`private _findOverlaps(searchWord: string, searchWordPos: number): Map<number, number> {`
			`const insertIndex = sortedIndex<{word: string}>(this._words, {word: searchWord},`
			`(a, b) => nativeCompare(a.word, b.word));`

			`// Maps index of item to its score.`
			`const scored = new Map<number, number>();`

			`// Search up and down the list, accepting smaller and smaller overlap.`
			`for (const step of [1, -1]) {`
			`let prefix = searchWord;`
			`let index = insertIndex + (step > 0 ? 0 : -1);`
			`while (prefix && index >= 0 && index < this._words.length) {`
			`for ( ; index >= 0 && index < this._words.length; index += step) {`
			`const wordEntry = this._words[index];`
			`// Once we reach a word that doesn't start with our prefix, break this loop, so we can`
			`// reduce the length of the prefix and keep scanning.`
			`if (!wordEntry.word.startsWith(prefix)) { break; }`

			`// The contribution of this word's to the score consists primarily of the length of`
			`// overlap (i.e. length for the current prefix).`
			`const baseScore = prefix.length;`

			`// To this we add 1 if the word matches exactly.`
			`const fullWordBonus = (wordEntry.word === searchWord ? 1 : 0);`

			`// To prefer matches where words occur in the same order as searched (e.g. searching for`
			`// "Foo B" should prefer "Foo Bar" over "Bar Foo"), we give a bonus based on the`
			`// position of the word in the search text and the entry text. (If positions match as`
			`// 0:0 and 1:1, the total position bonus is 2^0+2^(-2)=1.25; while the bonus from 0:1`
			`// and 1:0 would be 2^(-1) + 2^(-1)=1.0.)`
			`const positionBonus = Math.pow(2, -(searchWordPos + wordEntry.pos));`

			`const itemScore = baseScore + fullWordBonus + positionBonus;`
			`// Each search word contributes only one score (e.g. a search for "Foo" will partially`
			`// match both words in "forty five", but only the higher of the matches will count).`
			`if (itemScore >= (scored.get(wordEntry.index) \|\| 0)) {`
			`scored.set(wordEntry.index, itemScore);`
			`}`
			`}`
			`prefix = prefix.slice(0, -1);`
			`}`
			`}`
			`return scored;`
			`}`
			`}`


			`export type BuildHighlightFunc = (match: string) => DomContents;`

			`/**`
			`* Converts text to DOM with matching bits of text rendered using highlight(match) function.`
			`*/`
			`export function buildHighlightedDom(`
			`text: string, highlightFunc: HighlightFunc, highlight: BuildHighlightFunc`
			`): DomContents {`
			`if (!text) { return text; }`
			`const parts = highlightFunc(text);`
			`return parts.map((part, k) => k % 2 ? highlight(part) : part);`
			`}`


			`// Same as wordSepRegexp, but with capturing parentheses.`
			const wordSepRegexpParen = new RegExp(`(${wordSepRegexp.source})`);

			`/**`
			`* Splits text into pieces, with odd-numbered pieces the ones matching a prefix of some`
			`* searchWord, i.e. the ones to highlight.`
			`*/`
			`function highlightMatches(searchWords: string[], text: string): string[] {`
			`const textParts = text.split(wordSepRegexpParen);`
			`const outputs = [''];`
			`for (let i = 0; i < textParts.length; i += 2) {`
			`const word = textParts[i];`
			`const separator = textParts[i + 1] \|\| '';`
			`const prefixLen = findLongestPrefixLen(word.toLowerCase(), searchWords);`
			`if (prefixLen === 0) {`
			`outputs[outputs.length - 1] += word + separator;`
			`} else {`
			`outputs.push(word.slice(0, prefixLen), word.slice(prefixLen) + separator);`
			`}`
			`}`
			`return outputs;`
			`}`

			`function findLongestPrefixLen(text: string, choices: string[]): number {`
			`return choices.reduce((max, choice) => Math.max(max, findCommonPrefixLength(text, choice)), 0);`
			`}`

			`function findCommonPrefixLength(text1: string, text2: string): number {`
			`let i = 0;`
			`while (i < text1.length && text1[i] === text2[i]) { ++i; }`
			`return i;`
			`}`