mirror of
https://github.com/gristlabs/grist-core.git
synced 2024-10-27 20:44:07 +00:00
503 lines
18 KiB
TypeScript
503 lines
18 KiB
TypeScript
|
/**
|
||
|
* Module for serializing data in the format of Python 'marshal' module. It's used for
|
||
|
* communicating with the Python-based formula engine running in a Pypy sandbox. It supports
|
||
|
* version 0 of python marshalling format, which is what the Pypy sandbox supports.
|
||
|
*
|
||
|
* Usage:
|
||
|
* Marshalling:
|
||
|
* const marshaller = new Marshaller({version: 2});
|
||
|
* marshaller.marshal(value);
|
||
|
* marshaller.marshal(value);
|
||
|
* const buf = marshaller.dump(); // Leaves the marshaller empty.
|
||
|
*
|
||
|
* Unmarshalling:
|
||
|
* const unmarshaller = new Unmarshaller();
|
||
|
* unmarshaller.on('value', function(value) { ... });
|
||
|
* unmarshaller.push(buffer);
|
||
|
* unmarshaller.push(buffer);
|
||
|
*
|
||
|
* In Python, and in the marshalled format, there is a distinction between strings and unicode
|
||
|
* objects. In JS, there is a good correspondence to Uint8Array objects and strings, respectively.
|
||
|
* Python unicode objects always become JS strings. JS Uint8Arrays always become Python strings.
|
||
|
*
|
||
|
* JS strings become Python unicode objects, but can be marshalled to Python strings with
|
||
|
* 'stringToBuffer' option. Similarly, Python strings become JS Uint8Arrays, but can be
|
||
|
* unmarshalled to JS strings if 'bufferToString' option is set.
|
||
|
*/
|
||
|
import {BigInt} from 'app/common/BigInt';
|
||
|
import * as MemBuffer from 'app/common/MemBuffer';
|
||
|
import {EventEmitter} from 'events';
|
||
|
import * as util from 'util';
|
||
|
|
||
|
|
||
|
export interface MarshalOptions {
|
||
|
stringToBuffer?: boolean;
|
||
|
version?: number;
|
||
|
}
|
||
|
|
||
|
export interface UnmarshalOptions {
|
||
|
bufferToString?: boolean;
|
||
|
}
|
||
|
|
||
|
function ord(str: string): number {
|
||
|
return str.charCodeAt(0);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Type codes used for python marshalling of values.
|
||
|
* See pypy: rpython/translator/sandbox/_marshal.py.
|
||
|
*/
|
||
|
const marshalCodes = {
|
||
|
NULL : ord('0'),
|
||
|
NONE : ord('N'),
|
||
|
FALSE : ord('F'),
|
||
|
TRUE : ord('T'),
|
||
|
STOPITER : ord('S'),
|
||
|
ELLIPSIS : ord('.'),
|
||
|
INT : ord('i'),
|
||
|
INT64 : ord('I'),
|
||
|
/*
|
||
|
BFLOAT, for 'binary float', is an encoding of float that just encodes the bytes of the
|
||
|
double in standard IEEE 754 float64 format. It is used by Version 2+ of Python's marshal
|
||
|
module. Previously (in versions 0 and 1), the FLOAT encoding is used, which stores floats
|
||
|
through their string representations.
|
||
|
|
||
|
Version 0 (FLOAT) is mandatory for system calls within the sandbox, while Version 2 (BFLOAT)
|
||
|
is recommended for Grist's communication because it is more efficient and faster to
|
||
|
encode/decode
|
||
|
*/
|
||
|
BFLOAT : ord('g'),
|
||
|
FLOAT : ord('f'),
|
||
|
COMPLEX : ord('x'),
|
||
|
LONG : ord('l'),
|
||
|
STRING : ord('s'),
|
||
|
INTERNED : ord('t'),
|
||
|
STRINGREF: ord('R'),
|
||
|
TUPLE : ord('('),
|
||
|
LIST : ord('['),
|
||
|
DICT : ord('{'),
|
||
|
CODE : ord('c'),
|
||
|
UNICODE : ord('u'),
|
||
|
UNKNOWN : ord('?'),
|
||
|
SET : ord('<'),
|
||
|
FROZENSET: ord('>'),
|
||
|
};
|
||
|
|
||
|
type MarshalCode = keyof typeof marshalCodes;
|
||
|
|
||
|
// A little hack to test if the value is a 32-bit integer. Actually, for Python, int might be up
|
||
|
// to 64 bits (if that's the native size), but this is simpler.
|
||
|
// See http://stackoverflow.com/questions/3885817/how-to-check-if-a-number-is-float-or-integer.
|
||
|
function isInteger(n: number): boolean {
|
||
|
// Float have +0.0 and -0.0. To represent -0.0 precisely, we have to use a float, not an int
|
||
|
// (see also https://stackoverflow.com/questions/7223359/are-0-and-0-the-same).
|
||
|
// tslint:disable-next-line:no-bitwise
|
||
|
return n === +n && n === (n | 0) && !Object.is(n, -0.0);
|
||
|
}
|
||
|
|
||
|
// ----------------------------------------------------------------------
|
||
|
|
||
|
/**
|
||
|
* To force a value to be serialized using a particular representation (e.g. a number as INT64),
|
||
|
* wrap it into marshal.wrap('INT64', value) and serialize that.
|
||
|
*/
|
||
|
export function wrap(codeStr: MarshalCode, value: unknown) {
|
||
|
return new WrappedObj(marshalCodes[codeStr], value);
|
||
|
}
|
||
|
|
||
|
export class WrappedObj {
|
||
|
constructor(public code: number, public value: unknown) {}
|
||
|
|
||
|
public inspect() {
|
||
|
return util.inspect(this.value);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// ----------------------------------------------------------------------
|
||
|
|
||
|
/**
|
||
|
* @param {Boolean} options.stringToBuffer - If set, JS strings will become Python strings rather
|
||
|
* than unicode objects (as if each JS string is wrapped into MemBuffer.stringToArray(str)).
|
||
|
* This flag becomes a same-named property of Marshaller, which can be set at any time.
|
||
|
* @param {Number} options.version - If version >= 2, uses binary representation for floats. The
|
||
|
* default version 0 formats floats as strings.
|
||
|
*
|
||
|
* TODO: The default should be version 2. (0 was used historically because it was needed for
|
||
|
* communication with PyPy-based sandbox.)
|
||
|
*/
|
||
|
export class Marshaller {
|
||
|
private memBuf: MemBuffer;
|
||
|
private readonly floatCode: number;
|
||
|
private readonly stringCode: number;
|
||
|
|
||
|
constructor(options?: MarshalOptions) {
|
||
|
this.memBuf = new MemBuffer(undefined);
|
||
|
this.floatCode = options && options.version && options.version >= 2 ? marshalCodes.BFLOAT : marshalCodes.FLOAT;
|
||
|
this.stringCode = options && options.stringToBuffer ? marshalCodes.STRING : marshalCodes.UNICODE;
|
||
|
}
|
||
|
|
||
|
public dump(): Uint8Array {
|
||
|
// asByteArray returns a view on the underlying data, and the constructor creates a new copy.
|
||
|
// For some usages, we may want to avoid making the copy.
|
||
|
const bytes = new Uint8Array(this.memBuf.asByteArray());
|
||
|
this.memBuf.clear();
|
||
|
return bytes;
|
||
|
}
|
||
|
|
||
|
public dumpAsBuffer(): Buffer {
|
||
|
const bytes = Buffer.from(this.memBuf.asByteArray());
|
||
|
this.memBuf.clear();
|
||
|
return bytes;
|
||
|
}
|
||
|
|
||
|
public getCode(value: any) {
|
||
|
switch (typeof value) {
|
||
|
case 'number': return isInteger(value) ? marshalCodes.INT : this.floatCode;
|
||
|
case 'string': return this.stringCode;
|
||
|
case 'boolean': return value ? marshalCodes.TRUE : marshalCodes.FALSE;
|
||
|
case 'undefined': return marshalCodes.NONE;
|
||
|
case 'object': {
|
||
|
if (value instanceof WrappedObj) {
|
||
|
return value.code;
|
||
|
} else if (value === null) {
|
||
|
return marshalCodes.NONE;
|
||
|
} else if (value instanceof Uint8Array) {
|
||
|
return marshalCodes.STRING;
|
||
|
} else if (Buffer.isBuffer(value)) {
|
||
|
return marshalCodes.STRING;
|
||
|
} else if (Array.isArray(value)) {
|
||
|
return marshalCodes.LIST;
|
||
|
}
|
||
|
return marshalCodes.DICT;
|
||
|
}
|
||
|
default: {
|
||
|
throw new Error("Marshaller: Unsupported value of type " + (typeof value));
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
public marshal(value: any): void {
|
||
|
const code = this.getCode(value);
|
||
|
if (value instanceof WrappedObj) {
|
||
|
value = value.value;
|
||
|
}
|
||
|
this.memBuf.writeUint8(code);
|
||
|
switch (code) {
|
||
|
case marshalCodes.NULL: return;
|
||
|
case marshalCodes.NONE: return;
|
||
|
case marshalCodes.FALSE: return;
|
||
|
case marshalCodes.TRUE: return;
|
||
|
case marshalCodes.INT: return this.memBuf.writeInt32LE(value);
|
||
|
case marshalCodes.INT64: return this._writeInt64(value);
|
||
|
case marshalCodes.FLOAT: return this._writeStringFloat(value);
|
||
|
case marshalCodes.BFLOAT: return this.memBuf.writeFloat64LE(value);
|
||
|
case marshalCodes.STRING:
|
||
|
return (value instanceof Uint8Array || Buffer.isBuffer(value) ?
|
||
|
this._writeByteArray(value) :
|
||
|
this._writeUtf8String(value));
|
||
|
case marshalCodes.TUPLE: return this._writeList(value);
|
||
|
case marshalCodes.LIST: return this._writeList(value);
|
||
|
case marshalCodes.DICT: return this._writeDict(value);
|
||
|
case marshalCodes.UNICODE: return this._writeUtf8String(value);
|
||
|
// None of the following are supported.
|
||
|
case marshalCodes.STOPITER:
|
||
|
case marshalCodes.ELLIPSIS:
|
||
|
case marshalCodes.COMPLEX:
|
||
|
case marshalCodes.LONG:
|
||
|
case marshalCodes.INTERNED:
|
||
|
case marshalCodes.STRINGREF:
|
||
|
case marshalCodes.CODE:
|
||
|
case marshalCodes.UNKNOWN:
|
||
|
case marshalCodes.SET:
|
||
|
case marshalCodes.FROZENSET: throw new Error("Marshaller: Can't serialize code " + code);
|
||
|
default: throw new Error("Marshaller: Can't serialize code " + code);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private _writeInt64(value: number) {
|
||
|
if (!isInteger(value)) {
|
||
|
// TODO We could actually support 53 bits or so.
|
||
|
throw new Error("Marshaller: int64 still only supports 32-bit ints for now: " + value);
|
||
|
}
|
||
|
this.memBuf.writeInt32LE(value);
|
||
|
this.memBuf.writeInt32LE(value >= 0 ? 0 : -1);
|
||
|
}
|
||
|
|
||
|
private _writeStringFloat(value: number) {
|
||
|
// This could be optimized a bit, but it's only used in V0 marshalling, which is only used in
|
||
|
// sandbox system calls, which don't really ever use floats anyway.
|
||
|
const bytes = MemBuffer.stringToArray(value.toString());
|
||
|
if (bytes.byteLength >= 127) {
|
||
|
throw new Error("Marshaller: Trying to write a float that takes " + bytes.byteLength + " bytes");
|
||
|
}
|
||
|
this.memBuf.writeUint8(bytes.byteLength);
|
||
|
this.memBuf.writeByteArray(bytes);
|
||
|
}
|
||
|
|
||
|
private _writeByteArray(value: Uint8Array|Buffer) {
|
||
|
// This works for both Uint8Arrays and Node Buffers.
|
||
|
this.memBuf.writeInt32LE(value.length);
|
||
|
this.memBuf.writeByteArray(value);
|
||
|
}
|
||
|
|
||
|
private _writeUtf8String(value: string) {
|
||
|
const offset = this.memBuf.size();
|
||
|
// We don't know the length until we write the value.
|
||
|
this.memBuf.writeInt32LE(0);
|
||
|
this.memBuf.writeString(value);
|
||
|
const byteLength = this.memBuf.size() - offset - 4;
|
||
|
// Overwrite the 0 length we wrote earlier with the correct byte length.
|
||
|
this.memBuf.asDataView.setInt32(this.memBuf.startPos + offset, byteLength, true);
|
||
|
}
|
||
|
|
||
|
private _writeList(array: unknown[]) {
|
||
|
this.memBuf.writeInt32LE(array.length);
|
||
|
for (const item of array) {
|
||
|
this.marshal(item);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private _writeDict(obj: {[key: string]: any}) {
|
||
|
const keys = Object.keys(obj);
|
||
|
keys.sort();
|
||
|
for (const key of keys) {
|
||
|
this.marshal(key);
|
||
|
this.marshal(obj[key]);
|
||
|
}
|
||
|
this.memBuf.writeUint8(marshalCodes.NULL);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// ----------------------------------------------------------------------
|
||
|
|
||
|
const TwoTo32 = 0x100000000; // 2**32
|
||
|
const TwoTo15 = 0x8000; // 2**15
|
||
|
|
||
|
/**
|
||
|
* @param {Boolean} options.bufferToString - If set, Python strings will become JS strings rather
|
||
|
* than Buffers (as if each decoded buffer is wrapped into `buf.toString()`).
|
||
|
* This flag becomes a same-named property of Unmarshaller, which can be set at any time.
|
||
|
* Note that options.version isn't needed, since this will decode both formats.
|
||
|
* TODO: Integers (such as int64 and longs) that are too large for JS are currently represented as
|
||
|
* decimal strings. They may need a better representation, or a configurable option.
|
||
|
*/
|
||
|
export class Unmarshaller extends EventEmitter {
|
||
|
public memBuf: MemBuffer;
|
||
|
private consumer: any = null;
|
||
|
private _lastCode: number|null = null;
|
||
|
private readonly bufferToString: boolean;
|
||
|
private emitter: (v: any) => boolean;
|
||
|
private stringTable: Array<string|Uint8Array> = [];
|
||
|
|
||
|
constructor(options?: UnmarshalOptions) {
|
||
|
super();
|
||
|
this.memBuf = new MemBuffer(undefined);
|
||
|
this.bufferToString = Boolean(options && options.bufferToString);
|
||
|
this.emitter = this.emit.bind(this, 'value');
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Adds more data for parsing. Parsed values will be emitted as 'value' events.
|
||
|
* @param {Uint8Array|Buffer} byteArray: Uint8Array or Node Buffer with bytes to parse.
|
||
|
*/
|
||
|
public push(byteArray: Uint8Array|Buffer) {
|
||
|
this.parse(byteArray, this.emitter);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Adds data to parse, and calls valueCB(value) for each value parsed. If valueCB returns the
|
||
|
* Boolean false, stops parsing and returns.
|
||
|
*/
|
||
|
public parse(byteArray: Uint8Array|Buffer, valueCB: (val: any) => boolean|void) {
|
||
|
this.memBuf.writeByteArray(byteArray);
|
||
|
try {
|
||
|
while (this.memBuf.size() > 0) {
|
||
|
this.consumer = this.memBuf.makeConsumer();
|
||
|
|
||
|
// Have to reset stringTable for interned strings before each top-level parse call.
|
||
|
this.stringTable.length = 0;
|
||
|
|
||
|
const value = this._parse();
|
||
|
this.memBuf.consume(this.consumer);
|
||
|
if (valueCB(value) === false) {
|
||
|
return;
|
||
|
}
|
||
|
}
|
||
|
} catch (err) {
|
||
|
// If the error is `needMoreData`, we silently return. We'll retry by reparsing the message
|
||
|
// from scratch after the next push(). If buffers contain complete serialized messages, the
|
||
|
// cost should be minor. But this design might get very inefficient if we have big messages
|
||
|
// of arrays or dictionaries.
|
||
|
if (err.needMoreData) {
|
||
|
if (!err.consumedData || err.consumedData > 1024) {
|
||
|
// tslint:disable-next-line:no-console
|
||
|
console.log("Unmarshaller: Need more data; wasted parsing of %d bytes", err.consumedData);
|
||
|
}
|
||
|
} else {
|
||
|
err.message = "Unmarshaller: " + err.message;
|
||
|
throw err;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private _parse(): unknown {
|
||
|
const code = this.memBuf.readUint8(this.consumer);
|
||
|
this._lastCode = code;
|
||
|
switch (code) {
|
||
|
case marshalCodes.NULL: return null;
|
||
|
case marshalCodes.NONE: return null;
|
||
|
case marshalCodes.FALSE: return false;
|
||
|
case marshalCodes.TRUE: return true;
|
||
|
case marshalCodes.INT: return this._parseInt();
|
||
|
case marshalCodes.INT64: return this._parseInt64();
|
||
|
case marshalCodes.FLOAT: return this._parseStringFloat();
|
||
|
case marshalCodes.BFLOAT: return this._parseBinaryFloat();
|
||
|
case marshalCodes.STRING: return this._parseByteString();
|
||
|
case marshalCodes.TUPLE: return this._parseList();
|
||
|
case marshalCodes.LIST: return this._parseList();
|
||
|
case marshalCodes.DICT: return this._parseDict();
|
||
|
case marshalCodes.UNICODE: return this._parseUnicode();
|
||
|
case marshalCodes.INTERNED: return this._parseInterned();
|
||
|
case marshalCodes.STRINGREF: return this._parseStringRef();
|
||
|
case marshalCodes.LONG: return this._parseLong();
|
||
|
// None of the following are supported.
|
||
|
// case marshalCodes.STOPITER:
|
||
|
// case marshalCodes.ELLIPSIS:
|
||
|
// case marshalCodes.COMPLEX:
|
||
|
// case marshalCodes.CODE:
|
||
|
// case marshalCodes.UNKNOWN:
|
||
|
// case marshalCodes.SET:
|
||
|
// case marshalCodes.FROZENSET:
|
||
|
default:
|
||
|
throw new Error(`Unmarshaller: unsupported code "${String.fromCharCode(code)}" (${code})`);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private _parseInt() {
|
||
|
return this.memBuf.readInt32LE(this.consumer);
|
||
|
}
|
||
|
|
||
|
private _parseInt64() {
|
||
|
const low = this.memBuf.readInt32LE(this.consumer);
|
||
|
const hi = this.memBuf.readInt32LE(this.consumer);
|
||
|
if ((hi === 0 && low >= 0) || (hi === -1 && low < 0)) {
|
||
|
return low;
|
||
|
}
|
||
|
const unsignedLow = low < 0 ? TwoTo32 + low : low;
|
||
|
if (hi >= 0) {
|
||
|
return new BigInt(TwoTo32, [unsignedLow, hi], 1).toNative();
|
||
|
} else {
|
||
|
// This part is tricky. See unittests for check of correctness.
|
||
|
return new BigInt(TwoTo32, [TwoTo32 - unsignedLow, -hi - 1], -1).toNative();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private _parseLong() {
|
||
|
// The format is a 32-bit size whose sign is the sign of the result, followed by 16-bit digits
|
||
|
// in base 2**15.
|
||
|
const size = this.memBuf.readInt32LE(this.consumer);
|
||
|
const sign = size < 0 ? -1 : 1;
|
||
|
const numDigits = size < 0 ? -size : size;
|
||
|
const digits = [];
|
||
|
for (let i = 0; i < numDigits; i++) {
|
||
|
digits.push(this.memBuf.readInt16LE(this.consumer));
|
||
|
}
|
||
|
return new BigInt(TwoTo15, digits, sign).toNative();
|
||
|
}
|
||
|
|
||
|
private _parseStringFloat() {
|
||
|
const len = this.memBuf.readUint8(this.consumer);
|
||
|
const buf = this.memBuf.readString(this.consumer, len);
|
||
|
return parseFloat(buf);
|
||
|
}
|
||
|
|
||
|
private _parseBinaryFloat() {
|
||
|
return this.memBuf.readFloat64LE(this.consumer);
|
||
|
}
|
||
|
|
||
|
private _parseByteString(): string|Uint8Array {
|
||
|
const len = this.memBuf.readInt32LE(this.consumer);
|
||
|
return (this.bufferToString ?
|
||
|
this.memBuf.readString(this.consumer, len) :
|
||
|
this.memBuf.readByteArray(this.consumer, len));
|
||
|
}
|
||
|
|
||
|
private _parseInterned() {
|
||
|
const s = this._parseByteString();
|
||
|
this.stringTable.push(s);
|
||
|
return s;
|
||
|
}
|
||
|
|
||
|
private _parseStringRef() {
|
||
|
const index = this._parseInt();
|
||
|
return this.stringTable[index];
|
||
|
}
|
||
|
|
||
|
private _parseList() {
|
||
|
const len = this.memBuf.readInt32LE(this.consumer);
|
||
|
const value = [];
|
||
|
for (let i = 0; i < len; i++) {
|
||
|
value[i] = this._parse();
|
||
|
}
|
||
|
return value;
|
||
|
}
|
||
|
|
||
|
private _parseDict() {
|
||
|
const dict: {[key: string]: any} = {};
|
||
|
while (true) { // eslint-disable-line no-constant-condition
|
||
|
let key = this._parse() as string|Uint8Array;
|
||
|
if (key === null && this._lastCode === marshalCodes.NULL) {
|
||
|
break;
|
||
|
}
|
||
|
const value = this._parse();
|
||
|
if (key !== null) {
|
||
|
if (key instanceof Uint8Array) {
|
||
|
key = MemBuffer.arrayToString(key);
|
||
|
}
|
||
|
dict[key as string] = value;
|
||
|
}
|
||
|
}
|
||
|
return dict;
|
||
|
}
|
||
|
|
||
|
private _parseUnicode() {
|
||
|
const len = this.memBuf.readInt32LE(this.consumer);
|
||
|
return this.memBuf.readString(this.consumer, len);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Similar to python's marshal.loads(). Parses the given bytes and returns the parsed value. There
|
||
|
* must not be any trailing data beyond the single marshalled value.
|
||
|
*/
|
||
|
export function loads(byteArray: Uint8Array|Buffer, options?: UnmarshalOptions): any {
|
||
|
const unmarshaller = new Unmarshaller(options);
|
||
|
let parsedValue;
|
||
|
unmarshaller.parse(byteArray, function(value) {
|
||
|
parsedValue = value;
|
||
|
return false;
|
||
|
});
|
||
|
if (typeof parsedValue === 'undefined') {
|
||
|
throw new Error("loads: input data truncated");
|
||
|
} else if (unmarshaller.memBuf.size() > 0) {
|
||
|
throw new Error("loads: extra bytes past end of input");
|
||
|
}
|
||
|
return parsedValue;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Serializes arbitrary data by first marshalling then converting to a base64 string.
|
||
|
*/
|
||
|
export function dumpBase64(data: any, options?: MarshalOptions) {
|
||
|
const marshaller = new Marshaller(options || {version: 2});
|
||
|
marshaller.marshal(data);
|
||
|
return marshaller.dumpAsBuffer().toString('base64');
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Loads data from a base64 string, as serialized by dumpBase64().
|
||
|
*/
|
||
|
export function loadBase64(data: string, options?: UnmarshalOptions) {
|
||
|
return loads(Buffer.from(data, 'base64'), options);
|
||
|
}
|