(core) Move file import plugins into core/sandbox/grist

Summary:
Move all the plugins python code into the main folder with the core code.

Register file importing functions in the same main.py entrypoint as the data engine.

Remove options relating to different entrypoints and code directories. The only remaining plugin-specific option in NSandbox is the import directory/mount, i.e. where files to be parsed are placed.

Test Plan: this

Reviewers: paulfitz

Reviewed By: paulfitz

Subscribers: dsagal

Differential Revision: https://phab.getgrist.com/D2965
pull/71/head
Alex Hall 3 years ago
parent 5b92a43849
commit 4d526da58f

@ -200,7 +200,7 @@ export class DocPluginManager {
if (components) {
const { safePython, unsafeNode } = components;
if (safePython) {
const comp = pluginInstance.safePython = new SafePythonComponent(plugin, safePython, this._tmpDir,
const comp = pluginInstance.safePython = new SafePythonComponent(plugin, this._tmpDir,
this._activeDoc.docName, this._server);
pluginInstance.rpc.registerForwarder(safePython, comp);
}

@ -31,27 +31,17 @@ type SandboxMethod = (...args: any[]) => any;
*
* Once python is running, ordinarily some Grist code should be
* started by setting `useGristEntrypoint` (the only exception is
* in tests).
*
* The Grist code that runs is by default grist/main.py. For plugins,
* this is overridden, to run whatever is specified by plugin.script.
*
* in tests) which runs grist/main.py.
*/
interface ISandboxOptions {
command?: string; // External program or container to call to run the sandbox.
args: string[]; // The arguments to pass to the python process.
// When doing imports, the sandbox is started somewhat differently.
// Directories are shared with the sandbox that are not otherwise.
// Options for that that are collected in `plugin`. TODO: update
// TODO: update
// ISandboxCreationOptions to talk about directories instead of
// mounts, since it may not be possible to remap directories as
// mounts (e.g. for unsandboxed operation).
plugin?: {
importDir: string; // a directory containing data file(s) to import.
pluginDir: string; // a directory containing code for running the import.
script: string; // an entrypoint, relative to pluginDir.
}
importDir?: string; // a directory containing data file(s) to import by plugins
docUrl?: string; // URL to the document, for SELF_HYPERLINK
minimalPipeMode?: boolean; // Whether to use newer 3-pipe operation
@ -397,14 +387,8 @@ export class NSandboxCreator implements ISandboxCreator {
logTimes: options.logTimes,
command: this._command,
useGristEntrypoint: true,
importDir: options.importMount,
};
if (options.entryPoint) {
translatedOptions.plugin = {
script: options.entryPoint,
pluginDir: options.sandboxMount || '',
importDir: options.importMount || '',
};
}
return new NSandbox(translatedOptions, spawners[this._flavor]);
}
}
@ -422,24 +406,20 @@ type SpawnFn = (options: ISandboxOptions) => ChildProcess;
* I've done my best to avoid changing behavior by not touching it too much.
*/
function pynbox(options: ISandboxOptions): ChildProcess {
const {command, args: pythonArgs, unsilenceLog, plugin} = options;
const {command, args: pythonArgs, unsilenceLog, importDir} = options;
if (command) {
throw new Error("NaCl can only run the specific python2.7 package built for it");
}
if (options.useGristEntrypoint) {
pythonArgs.unshift(plugin?.script || 'grist/main.pyc');
pythonArgs.unshift('grist/main.pyc');
}
const spawnOptions = {
stdio: ['pipe', 'pipe', 'pipe'] as 'pipe'[],
env: getWrappingEnv(options)
};
const wrapperArgs = new FlagBag({env: '-E', mount: '-m'});
if (plugin) {
// TODO: Only modules that we share with plugins should be mounted. They could be gathered in
// a "$APPROOT/sandbox/plugin" folder, only which get mounted.
wrapperArgs.addMount(`${plugin.pluginDir}:/sandbox:ro`);
wrapperArgs.addMount(`${plugin.importDir}:/importdir:ro`);
if (importDir) {
wrapperArgs.addMount(`${importDir}:/importdir:ro`);
}
if (!options.minimalPipeMode) {
@ -475,16 +455,16 @@ function pynbox(options: ISandboxOptions): ChildProcess {
* been installed globally.
*/
function unsandboxed(options: ISandboxOptions): ChildProcess {
const {args: pythonArgs, plugin} = options;
const {args: pythonArgs, importDir} = options;
const paths = getAbsolutePaths(options);
if (options.useGristEntrypoint) {
pythonArgs.unshift(paths.plugin?.script || paths.main);
pythonArgs.unshift(paths.main);
}
const spawnOptions = {
stdio: ['pipe', 'pipe', 'pipe'] as 'pipe'[],
env: {
PYTHONPATH: paths.engine,
IMPORTDIR: plugin?.importDir,
IMPORTDIR: importDir,
...getInsertedEnv(options),
...getWrappingEnv(options),
}
@ -531,12 +511,11 @@ function gvisor(options: ISandboxOptions): ChildProcess {
wrapperArgs.addEnv('PYTHONPATH', paths.engine);
wrapperArgs.addAllEnv(getInsertedEnv(options));
wrapperArgs.addMount(paths.sandboxDir);
if (paths.plugin) {
wrapperArgs.addMount(paths.plugin.pluginDir);
wrapperArgs.addMount(paths.plugin.importDir);
wrapperArgs.addEnv('IMPORTDIR', paths.plugin.importDir);
pythonArgs.unshift(paths.plugin.script);
} else if (options.useGristEntrypoint) {
if (paths.importDir) {
wrapperArgs.addMount(paths.importDir);
wrapperArgs.addEnv('IMPORTDIR', paths.importDir);
}
if (options.useGristEntrypoint) {
pythonArgs.unshift(paths.main);
}
if (options.deterministicMode) {
@ -558,17 +537,15 @@ function gvisor(options: ISandboxOptions): ChildProcess {
function docker(options: ISandboxOptions): ChildProcess {
const {args: pythonArgs, command} = options;
if (options.useGristEntrypoint) {
pythonArgs.unshift(options.plugin?.script || 'grist/main.py');
pythonArgs.unshift('grist/main.py');
}
if (!options.minimalPipeMode) {
throw new Error("docker only supports 3-pipe operation (although runc has --preserve-file-descriptors)");
}
const paths = getAbsolutePaths(options);
const plugin = paths.plugin;
const wrapperArgs = new FlagBag({env: '--env', mount: '-v'});
if (plugin) {
wrapperArgs.addMount(`${plugin.pluginDir}:/sandbox:ro`);
wrapperArgs.addMount(`${plugin.importDir}:/importdir:ro`);
if (paths.importDir) {
wrapperArgs.addMount(`${paths.importDir}:/importdir:ro`);
}
wrapperArgs.addMount(`${paths.engine}:/grist:ro`);
wrapperArgs.addAllEnv(getInsertedEnv(options));
@ -646,18 +623,12 @@ function getAbsolutePaths(options: ISandboxOptions) {
const sandboxDir = path.join(fs.realpathSync(path.join(process.cwd(), 'sandbox', 'grist')),
'..');
// Copy plugin options, and then make them absolute.
const plugin = options.plugin && { ...options.plugin };
if (plugin) {
plugin.pluginDir = fs.realpathSync(plugin.pluginDir);
plugin.importDir = fs.realpathSync(plugin.importDir);
// Plugin dir is ..../sandbox, and entry point is sandbox/...
// This may not be a general rule, it may be just for the "core" plugin, but
// that suffices for now.
plugin.script = path.join(plugin.pluginDir, '..', plugin.script);
if (options.importDir) {
options.importDir = fs.realpathSync(options.importDir);
}
return {
sandboxDir,
plugin,
importDir: options.importDir,
main: path.join(sandboxDir, 'grist/main.py'),
engine: path.join(sandboxDir, 'grist'),
};

@ -4,7 +4,6 @@ import {GristServer} from 'app/server/lib/GristServer';
import {ISandbox} from 'app/server/lib/ISandbox';
import * as log from 'app/server/lib/log';
import {IMsgCustom, IMsgRpcCall} from 'grain-rpc';
import * as path from 'path';
// TODO safePython component should be able to call other components' functions
// TODO calling a function on safePython component with a name that was not registered should fail
@ -22,10 +21,10 @@ export class SafePythonComponent extends BaseComponent {
// safe python component does not need pluginInstance.rpc because it is not possible to forward
// calls to other component from within python
constructor(private _localPlugin: LocalPlugin,
private _mainPath: string, private _tmpDir: string,
constructor(_localPlugin: LocalPlugin,
private _tmpDir: string,
docName: string, private _server: GristServer,
rpcLogger = createRpcLogger(log, `PLUGIN ${_localPlugin.id}/${_mainPath} SafePython:`)) {
rpcLogger = createRpcLogger(log, `PLUGIN ${_localPlugin.id} SafePython:`)) {
super(_localPlugin.manifest, rpcLogger);
this._logMeta = {plugin: _localPlugin.id, docId: docName};
}
@ -39,8 +38,6 @@ export class SafePythonComponent extends BaseComponent {
throw new Error("Sanbox should have a tmpDir");
}
this._sandbox = this._server.create.NSandbox({
entryPoint: this._mainPath,
sandboxMount: path.join(this._localPlugin.path, 'sandbox'),
importMount: this._tmpDir,
logTimes: true,
logMeta: this._logMeta,

@ -20,5 +20,4 @@ contributions:
scripts:
build:
# Note that ${XUNIT:+xxx} inserts "xxx" when XUNIT is set, and nothing otherwise.
test: $GRIST_PYTHON -m runtests discover -v -s /sandbox ${XUNIT:+--xunit}
test:

@ -1 +0,0 @@
__path__ = __import__('pkgutil').extend_path(__path__, __name__)

@ -1,184 +0,0 @@
from __future__ import absolute_import
import functools
from collections import namedtuple
from threading import RLock
_CacheInfo = namedtuple("CacheInfo", ["hits", "misses", "maxsize", "currsize"])
@functools.wraps(functools.update_wrapper)
def update_wrapper(wrapper,
                   wrapped,
                   assigned = functools.WRAPPER_ASSIGNMENTS,
                   updated = functools.WRAPPER_UPDATES):
    """
    Patch two bugs in functools.update_wrapper.

    Only the attributes in `assigned` that `wrapped` actually has are copied
    onto `wrapper`, and `wrapper.__wrapped__` is always set to `wrapped`.

    Returns `wrapper`, as returned by functools.update_wrapper.
    """
    # workaround for http://bugs.python.org/issue3445
    # Skip attributes that `wrapped` does not define instead of failing on them.
    assigned = tuple(attr for attr in assigned if hasattr(wrapped, attr))
    wrapper = functools.update_wrapper(wrapper, wrapped, assigned, updated)
    # workaround for https://bugs.python.org/issue17482
    # Set __wrapped__ explicitly, after the attribute and dict updates above.
    wrapper.__wrapped__ = wrapped
    return wrapper
class _HashedSeq(list):
__slots__ = 'hashvalue'
def __init__(self, tup, hash=hash):
self[:] = tup
self.hashvalue = hash(tup)
def __hash__(self):
return self.hashvalue
def _make_key(args, kwds, typed,
kwd_mark=(object(),),
fasttypes=set([int, str, frozenset, type(None)]),
sorted=sorted, tuple=tuple, type=type, len=len):
'Make a cache key from optionally typed positional and keyword arguments'
key = args
if kwds:
sorted_items = sorted(kwds.items())
key += kwd_mark
for item in sorted_items:
key += item
if typed:
key += tuple(type(v) for v in args)
if kwds:
key += tuple(type(v) for k, v in sorted_items)
elif len(key) == 1 and type(key[0]) in fasttypes:
return key[0]
return _HashedSeq(key)
def lru_cache(maxsize=100, typed=False):
    """Least-recently-used cache decorator.

    If *maxsize* is set to None, the LRU features are disabled and the cache
    can grow without bound.

    If *typed* is True, arguments of different types will be cached separately.
    For example, f(3.0) and f(3) will be treated as distinct calls with
    distinct results.

    Arguments to the cached function must be hashable.

    View the cache statistics named tuple (hits, misses, maxsize, currsize) with
    f.cache_info().  Clear the cache and statistics with f.cache_clear().
    Access the underlying function with f.__wrapped__.

    See:  http://en.wikipedia.org/wiki/Cache_algorithms#Least_Recently_Used

    """

    # Users should only access the lru_cache through its public API:
    #       cache_info, cache_clear, and f.__wrapped__
    # The internals of the lru_cache are encapsulated for thread safety and
    # to allow the implementation to change (including a possible C version).

    def decorating_function(user_function):

        cache = dict()
        stats = [0, 0]                  # make statistics updateable non-locally
        HITS, MISSES = 0, 1             # names for the stats fields
        make_key = _make_key
        cache_get = cache.get           # bound method to lookup key or return None
        _len = len                      # localize the global len() function
        lock = RLock()                  # because linkedlist updates aren't threadsafe
        root = []                       # root of the circular doubly linked list
        root[:] = [root, root, None, None]      # initialize by pointing to self
        nonlocal_root = [root]                  # make updateable non-locally
        PREV, NEXT, KEY, RESULT = 0, 1, 2, 3    # names for the link fields

        if maxsize == 0:

            def wrapper(*args, **kwds):
                # no caching, just do a statistics update after a successful call
                result = user_function(*args, **kwds)
                stats[MISSES] += 1
                return result

        elif maxsize is None:

            def wrapper(*args, **kwds):
                # simple caching without ordering or size limit
                key = make_key(args, kwds, typed)
                result = cache_get(key, root)   # root used here as a unique not-found sentinel
                if result is not root:
                    stats[HITS] += 1
                    return result
                result = user_function(*args, **kwds)
                cache[key] = result
                stats[MISSES] += 1
                return result

        else:

            def wrapper(*args, **kwds):
                # size limited caching that tracks accesses by recency
                key = make_key(args, kwds, typed) if kwds or typed else args
                with lock:
                    link = cache_get(key)
                    if link is not None:
                        # record recent use of the key by moving it to the front of the list
                        root, = nonlocal_root
                        link_prev, link_next, key, result = link
                        link_prev[NEXT] = link_next
                        link_next[PREV] = link_prev
                        last = root[PREV]
                        last[NEXT] = root[PREV] = link
                        link[PREV] = last
                        link[NEXT] = root
                        stats[HITS] += 1
                        return result
                # The user function runs outside the lock.
                result = user_function(*args, **kwds)
                with lock:
                    root, = nonlocal_root
                    if key in cache:
                        # getting here means that this same key was added to the
                        # cache while the lock was released.  since the link
                        # update is already done, we need only return the
                        # computed result and update the count of misses.
                        pass
                    elif _len(cache) >= maxsize:
                        # use the old root to store the new key and result
                        oldroot = root
                        oldroot[KEY] = key
                        oldroot[RESULT] = result
                        # empty the oldest link and make it the new root
                        root = nonlocal_root[0] = oldroot[NEXT]
                        oldkey = root[KEY]
                        root[KEY] = root[RESULT] = None
                        # now update the cache dictionary for the new links
                        del cache[oldkey]
                        cache[key] = oldroot
                    else:
                        # put result in a new link at the front of the list
                        last = root[PREV]
                        link = [last, root, key, result]
                        last[NEXT] = root[PREV] = cache[key] = link
                    stats[MISSES] += 1
                return result

        def cache_info():
            """Report cache statistics"""
            with lock:
                return _CacheInfo(stats[HITS], stats[MISSES], maxsize, len(cache))

        def cache_clear():
            """Clear the cache and cache statistics"""
            with lock:
                cache.clear()
                root = nonlocal_root[0]
                root[:] = [root, root, None, None]
                stats[:] = [0, 0]

        wrapper.__wrapped__ = user_function
        wrapper.cache_info = cache_info
        wrapper.cache_clear = cache_clear
        return update_wrapper(wrapper, user_function)

    return decorating_function

@ -1,25 +0,0 @@
import logging
import sandbox
import import_csv
import import_xls
import import_json
def main():
    """
    Entry point of the plugin sandbox process: set up logging, register the
    CSV, XLS and JSON parse functions with the sandbox, then call sandbox.run().
    """
    # Send log records of level INFO and above through a stream handler,
    # prefixed with a millisecond-resolution timestamp.
    s = logging.StreamHandler()
    s.setFormatter(logging.Formatter(fmt='%(asctime)s.%(msecs)03d %(message)s',
                                     datefmt='%Y-%m-%d %H:%M:%S'))
    rootLogger = logging.getLogger()
    rootLogger.addHandler(s)
    rootLogger.setLevel(logging.INFO)

    # Todo: Grist should expose a register method accepting arguments as
    # follows: register('csv_parser', 'canParse', can_parse)
    sandbox.register("csv_parser.parseFile", import_csv.parse_file_source)
    sandbox.register("xls_parser.parseFile", import_xls.import_file)
    sandbox.register("json_parser.parseFile", import_json.parse_file)

    sandbox.run()


if __name__ == "__main__":
    main()

@ -10,7 +10,7 @@ import six
from six.moves import zip
import parse_data
import import_utils
from imports import import_utils
log = logging.getLogger(__name__)

@ -78,7 +78,7 @@ from itertools import count, chain
import six
import import_utils
from imports import import_utils
Ref = namedtuple('Ref', ['table_name', 'rowid'])
Row = namedtuple('Row', ['values', 'parent', 'ref'])

@ -2,19 +2,18 @@
This module reads a file path that is passed in using ActiveDoc.importFile()
and returns an object formatted so that it can be used by grist for a bulk add records action
"""
import os
import csv
import itertools
import logging
import os
import chardet
import messytables
import messytables.excel
import six
from six.moves import zip
import parse_data
import import_utils
from imports import import_utils
log = logging.getLogger(__name__)
@ -116,3 +115,36 @@ def parse_open_file(file_obj, orig_name, table_name_hint=None):
parse_options = {}
return parse_options, export_list
# This change was initially introduced in https://phab.getgrist.com/D2145
# Monkey-patching done in https://phab.getgrist.com/D2965
# to move towards normal dependency management
@staticmethod
def from_xlrdcell(xlrd_cell, sheet, col, row):
    """
    Build a messytables XLSCell from an xlrd cell.

    Date cells are converted to a `time` (when the date part is all zeros) or a
    `datetime`. If the conversion fails for any reason, the raw cell value is
    kept, so a date we can't handle does not abort the import.

    Returns the new XLSCell with `sheet`, `xlrd_cell` and `xlrd_pos` set on it.
    """
    # Imported here, inside the function that is assigned onto XLSCell.
    from messytables.excel import (
        XLS_TYPES, StringType, DateType, InvalidDateError, xlrd, time, datetime, XLSCell
    )
    value = xlrd_cell.value
    cell_type = XLS_TYPES.get(xlrd_cell.ctype, StringType())
    if cell_type == DateType(None):
        # Try-catch added by Dmitry, to avoid failing even if we see a date we can't handle.
        try:
            if value == 0:
                raise InvalidDateError
            year, month, day, hour, minute, second = \
                xlrd.xldate_as_tuple(value, sheet.book.datemode)
            if (year, month, day) == (0, 0, 0):
                value = time(hour, minute, second)
            else:
                value = datetime(year, month, day, hour, minute, second)
        except Exception:
            # Keep going, and we'll just interpret the date as a number.
            pass
    messy_cell = XLSCell(value, type=cell_type)
    messy_cell.sheet = sheet
    messy_cell.xlrd_cell = xlrd_cell
    messy_cell.xlrd_pos = (row, col)  # necessary for properties, note not (x,y)
    return messy_cell
messytables.excel.XLSCell.from_xlrdcell = from_xlrdcell

@ -1,60 +0,0 @@
"""This module loads a file_importer that implements the Grist import
API, and calls its selected method passing argument received from
PluginManager.sandboxImporter(). It returns an object formatted so
that it can be used by Grist.
"""
import sys
import argparse
import logging
import imp
import json
import marshal
log = logging.getLogger(__name__)
# Include /thirdparty into module search paths, in particular for messytables.
# pylint: disable=wrong-import-position
sys.path.append('/thirdparty')
def marshal_data(export_list):
    """Serialize `export_list` with the marshal module, using format version 2."""
    return marshal.dumps(export_list, 2)
def main():
    """
    Command-line entry point: load a compiled import plugin, call the chosen
    action on the input file, and write the marshalled result to stdout.

    With --debug the log level is DEBUG and the binary output is not written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--debug', action='store_true',
                        help="Print debug instead of producing normal binary output")
    parser.add_argument('-t', '--table',
                        help="Suggested table name to use with CSV imports")
    parser.add_argument('-n', '--plugin-name', required=True,
                        help="Name of a python module implementing the import API.")
    parser.add_argument('-p', '--plugin-path',
                        help="Location of the module.")
    parser.add_argument('--action-options',
                        help="Options to pass to the action. See API documentation.")
    parser.add_argument('action', help='Action to call',
                        choices=['can_parse', 'parse_file'])
    parser.add_argument('input', help='File to convert')
    args = parser.parse_args()

    # Log through a stream handler, with a millisecond-resolution timestamp.
    s = logging.StreamHandler()
    s.setFormatter(logging.Formatter(fmt='%(asctime)s.%(msecs)03d %(message)s',
                                     datefmt='%Y-%m-%d %H:%M:%S'))
    rootLogger = logging.getLogger()
    rootLogger.addHandler(s)
    rootLogger.setLevel(logging.DEBUG if args.debug else logging.INFO)

    # Load the plugin module from its compiled file.
    import_plugin = imp.load_compiled(
        args.plugin_name,
        args.plugin_path)

    # --action-options is a JSON object passed to the action as keyword arguments.
    options = {}
    if args.action_options:
        options = json.loads(args.action_options)
    parsed_data = getattr(import_plugin, args.action)(args.input, **options)
    marshalled_data = marshal_data(parsed_data)
    log.info("Marshalled data has %d bytes", len(marshalled_data))
    if not args.debug:
        sys.stdout.write(marshalled_data)


if __name__ == "__main__":
    main()

@ -0,0 +1,18 @@
def register_import_parsers(sandbox):
    """
    Register the CSV, Excel and JSON file parsers on `sandbox` under the names
    "csv_parser.parseFile", "xls_parser.parseFile" and "json_parser.parseFile".

    Each registered function imports its parser module only when it is called,
    so calling this function does not itself import any of the `imports.*`
    parser modules.
    """
    def parse_csv(file_source, options):
        from imports.import_csv import parse_file_source
        return parse_file_source(file_source, options)

    sandbox.register("csv_parser.parseFile", parse_csv)

    def parse_excel(file_source, parse_options):
        from imports.import_xls import import_file
        return import_file(file_source, parse_options)

    sandbox.register("xls_parser.parseFile", parse_excel)

    def parse_json(file_source, parse_options):
        from imports.import_json import parse_file
        return parse_file(file_source, parse_options)

    sandbox.register("json_parser.parseFile", parse_json)

@ -1,5 +1,5 @@
import unittest
from dateguess import guess, guess_bulk
from imports.dateguess import guess, guess_bulk
class TestGuesser(unittest.TestCase):

@ -1,12 +1,4 @@
# This Python file uses the following encoding: utf-8
# Run tests with:
#
# ./sandbox/nacl/bin/sel_ldr -E PYTHONPATH=/grist:/thirdparty -B ./sandbox/nacl/lib/irt_core.nexe -l /dev/null -m ./sandbox/nacl/root:/:ro -m ./plugins/core/sandbox:/sandbox:ro ./sandbox/nacl/lib/runnable-ld.so --library-path /slib /python/bin/python2.7.nexe -m unittest discover -v -s /sandbox #pylint: disable=line-too-long
#
#
# TODO: run test automatically
#
import math
import os
import textwrap
import unittest
@ -15,11 +7,11 @@ import csv
import calendar
import datetime
import import_csv
from imports import import_csv
def _get_fixture(filename):
return os.path.join(os.path.dirname(__file__), "test/fixtures", filename)
return os.path.join(os.path.dirname(__file__), "fixtures", filename)
def bytes_io_from_str(string):

@ -1,5 +1,5 @@
from unittest import TestCase
import import_json
from imports import import_json
class TestImportJSON(TestCase):

@ -5,10 +5,10 @@ import math
import os
import unittest
import import_xls
from imports import import_xls
def _get_fixture(filename):
return [os.path.join(os.path.dirname(__file__), "test/fixtures", filename), filename]
return [os.path.join(os.path.dirname(__file__), "fixtures", filename), filename]
class TestImportXLS(unittest.TestCase):

@ -11,14 +11,15 @@ import functools
import six
from acl_formula import parse_acl_formula
import actions
from sandbox import get_default_sandbox
import engine
import migrations
import schema
import useractions
import objtypes
from acl_formula import parse_acl_formula
from sandbox import get_default_sandbox
from imports.register import register_import_parsers
import logger
log = logger.Logger(__name__, logger.INFO)
@ -107,6 +108,8 @@ def run(sandbox):
export(eng.load_empty)
export(eng.load_done)
register_import_parsers(sandbox)
sandbox.run()
def main():

@ -7,7 +7,7 @@ dictionary with "type" and "data" fields, where "type" is a Grist type string, a
of values. All "data" lists will have the same length.
"""
import dateguess
from imports import dateguess
import datetime
import logging
import re
Loading…
Cancel
Save