gristlabs_grist-core/app/server/lib/uploads.ts

481 lines
19 KiB
TypeScript
Raw Normal View History

import {ApiError} from 'app/common/ApiError';
import {InactivityTimer} from 'app/common/InactivityTimer';
import {FetchUrlOptions, FileUploadResult, UPLOAD_URL_PATH, UploadResult} from 'app/common/uploads';
import {getDocWorkerUrl} from 'app/common/UserAPI';
import {getAuthorizedUserId, getTransitiveHeaders, getUserId, isSingleUserMode,
RequestWithLogin} from 'app/server/lib/Authorizer';
import {expressWrap} from 'app/server/lib/expressWrap';
import {downloadFromGDrive, isDriveUrl} from 'app/server/lib/GoogleImport';
(core) move more tests to grist-core Summary: * Tie build and run-time docker base images to a consistent version (buster) * Extend the test login system activated by GRIST_TEST_LOGIN to ease porting tests that currently rely on cognito (many) * Make org resets work in absence of billing endpoints * When in-memory session caches are used, add missing invalidation steps * Pass org information through sign-ups/sign-ins more carefully * For CORS, explicitly trust GRIST_HOST origin when set * Move some fixtures and tests to core, focussing on tests that cover existing failures or are in the set of tests run on deployments * Retain regular `test` target to run the test suite directly, without docker * Add a `test:smoke` target to run a single simple test without `GRIST_TEST_LOGIN` activated * Add a `test:docker` target to run the tests against a grist-core docker image - since tests rely on certain fixture teams/docs, added `TEST_SUPPORT_API_KEY` and `TEST_ADD_SAMPLES` flags to ease porting The tests ported were `nbrowser` tests: `ActionLog.ts` (the first test I tend to port to anything, out of habit), `Fork.ts` (exercises a lot of doc creation paths), `HomeIntro.ts` (a lot of DocMenu exercise), and `DuplicateDocument.ts` (covers a feature known to be failing prior to this diff, the CORS tweak resolves it). Test Plan: Manually tested via `buildtools/build_core.sh`. In follow up, I want to add running the `test:docker` target in grist-core's workflows. In jenkins, only the smoke test is run. There'd be an argument for running all tests, but they include particularly slow tests, and are duplicates of tests already run (in different configuration admittedly), so I'd like to try first just using them in grist-core to gate updates to any packaged version of Grist (the docker image currently). Reviewers: alexmojaki Reviewed By: alexmojaki Subscribers: alexmojaki Differential Revision: https://phab.getgrist.com/D3176
2021-12-10 22:42:54 +00:00
import {GristServer, RequestWithGrist} from 'app/server/lib/GristServer';
import {guessExt} from 'app/server/lib/guessExt';
import log from 'app/server/lib/log';
import {optStringParam} from 'app/server/lib/requestUtils';
import {isPathWithin} from 'app/server/lib/serverUtils';
import * as shutdown from 'app/server/lib/shutdown';
import {fromCallback} from 'bluebird';
import * as contentDisposition from 'content-disposition';
import {Application, Request, RequestHandler, Response} from 'express';
import * as fse from 'fs-extra';
import pick = require('lodash/pick');
import * as multiparty from 'multiparty';
import fetch, {Response as FetchResponse} from 'node-fetch';
import * as path from 'path';
import * as tmp from 'tmp';
// After some time of inactivity, clean up the upload. We give an hour, which seems generous,
// except that if one is toying with import options, and leaves the upload in an open browser idle
// for an hour, it will get cleaned up. TODO Address that; perhaps just with some UI messages.
const INACTIVITY_CLEANUP_MS = 60 * 60 * 1000; // an hour, very generously.
// A hook for dependency injection. Code in this module calls `Deps.fetch` and reads
// `Deps.INACTIVITY_CLEANUP_MS` rather than using the underlying values directly, so that both
// can be replaced from outside (presumably by tests).
export const Deps = {fetch, INACTIVITY_CLEANUP_MS};
/**
 * The outcome of processing a submitted form: an optional UploadResult, with parameters.
 */
export interface FormResult {
upload?: UploadResult; // Set only when the form contained an 'upload' file field.
parameters?: {[key: string]: string}; // First value of each non-file form field, keyed by field name.
}
/**
 * Adds the upload routes to the given express app:
 *  - OPTIONS for UPLOAD_URL_PATH and /copy, to answer cross-origin preflight checks;
 *  - POST to UPLOAD_URL_PATH, which stores the posted form files as a new upload;
 *  - POST to /copy, which creates an upload from a document already known to us.
 *
 * @param server - Grist server, passed on to fetchDoc() when copying a document.
 * @param expressApp - Express application the routes are attached to.
 * @param handlers - Middleware installed in front of every route registered here.
 */
export function addUploadRoute(server: GristServer, expressApp: Application, ...handlers: RequestHandler[]): void {
  // When doing a cross-origin post, the browser will check for access with options prior to posting.
  // We need to reassure it that the request will be accepted before it will go ahead and post.
  expressApp.options([`/${UPLOAD_URL_PATH}`, '/copy'], ...handlers, async (req, res) => {
    // Origin is checked by middleware - if we get this far, we are ok.
    res.status(200).send();
  });

  expressApp.post(`/${UPLOAD_URL_PATH}`, ...handlers, expressWrap(async (req: Request, res: Response) => {
    try {
      const uploadResult: UploadResult = await handleUpload(req, res);
      res.status(200).send(JSON.stringify(uploadResult));
    } catch (err) {
      // Resume the request stream (presumably so any unread part of the body gets drained).
      req.resume();
      if (err.message && /Request aborted/.test(err.message)) {
        log.warn("File upload request aborted", err);
      } else {
        log.error("Error uploading file", err);
      }
      // Respond with a JSON error like jsonErrorHandler does for API calls,
      // to make it easier for the caller to parse it.
      res.status(err.status || 500).json({error: err.message || 'internal error'});
    }
  }));

  // Like upload, but copy data from a document already known to us.
  expressApp.post(`/copy`, ...handlers, expressWrap(async (req: Request, res: Response) => {
    const docId = optStringParam(req.query.doc, 'doc');
    const name = optStringParam(req.query.name, 'name');
    // A missing parameter is the caller's mistake: report it as a 400 rather than letting a
    // plain Error surface as an internal server error.
    if (!docId) { throw new ApiError('doc must be specified', 400); }
    const accessId = makeAccessId(req, getAuthorizedUserId(req));
    try {
      const uploadResult: UploadResult = await fetchDoc(server, docId, req, accessId,
                                                        req.query.template === '1');
      if (name) {
        globalUploadSet.changeUploadName(uploadResult.uploadId, accessId, name);
      }
      res.status(200).send(JSON.stringify(uploadResult));
    } catch (err) {
      // Replace a generic "access denied" with a message specific to copying.
      if ((err as ApiError).status === 403) {
        res.status(403).json({error:'Insufficient access to document to copy it entirely'});
        return;
      }
      throw err;
    }
  }));
}
/**
 * Builds the FileUploadInfo record describing a file that already exists on disk. The original
 * name and the lower-cased extension are derived from the path; the size is read from the
 * filesystem.
 */
export async function getFileUploadInfo(filePath: string): Promise<FileUploadInfo> {
  const stats = await fse.stat(filePath);
  const origName = path.basename(filePath);
  const ext = path.extname(filePath).toLowerCase();
  return {absPath: filePath, origName, size: stats.size, ext};
}
/**
 * Handles a request to the express /upload route. In contrast to handleOptionalUpload(), the
 * form is required to carry file data: an ApiError with status 400 is thrown when it does not.
 */
export async function handleUpload(req: Request, res: Response): Promise<UploadResult> {
  const formResult = await handleOptionalUpload(req, res);
  if (!formResult.upload) {
    throw new ApiError('missing payload', 400);
  }
  return formResult.upload;
}
/**
 * Process form data that may contain an upload, returning that upload (if present)
 * and any parameters.
 *
 * Files are received into a fresh temporary directory. If the form has an 'upload' field, the
 * directory is handed over to globalUploadSet, which becomes responsible for removing it. In
 * every other case (no 'upload' field, or a failure while receiving the form) the directory is
 * removed before this function returns or throws, so that it does not linger on disk.
 *
 * @param req - Incoming multipart form request.
 * @param res - Response object (not used here).
 * @returns The registered upload, if any, and the first value of each non-file form field.
 */
export async function handleOptionalUpload(req: Request, res: Response): Promise<FormResult> {
  const {tmpDir, cleanupCallback} = await createTmpDir({});
  // Until the upload is registered, nothing but this function knows about tmpDir.
  let registered = false;
  try {
    const mreq = req as RequestWithLogin;
    const meta = {
      org: mreq.org,
      email: mreq.user && mreq.user.loginEmail,
      userId: mreq.userId,
      altSessionId: mreq.altSessionId,
    };

    log.rawDebug(`Prepared to receive upload into tmp dir ${tmpDir}`, meta);

    // Note that we don't limit upload sizes here, since this endpoint doesn't know what kind of
    // upload it is, and some uploads are unlimited (e.g. uploading .grist files). Limits are
    // checked in the client, and should be enforced on the server where an upload is processed.
    const form = new multiparty.Form({uploadDir: tmpDir});
    const [formFields, formFiles] = await fromCallback((cb: any) => form.parse(req, cb),
                                                       {multiArgs: true});

    // 'upload' is the name of the form field containing file data.
    let upload: UploadResult|undefined;
    if (formFiles.upload) {
      const uploadedFiles: FileUploadInfo[] = [];
      for (const file of formFiles.upload) {
        const mimeType = file.headers['content-type'];
        log.rawDebug(`Received file ${file.originalFilename} (${file.size} bytes)`, meta);
        uploadedFiles.push({
          absPath: file.path,
          origName: file.originalFilename,
          size: file.size,
          ext: await guessExt(file.path, file.originalFilename, mimeType),
        });
      }
      const accessId = makeAccessId(req, getUserId(req));
      const uploadId = globalUploadSet.registerUpload(uploadedFiles, tmpDir, cleanupCallback, accessId);
      // From here on, globalUploadSet owns tmpDir and will clean it up.
      registered = true;
      const files: FileUploadResult[] = uploadedFiles.map(f => pick(f, ['origName', 'size', 'ext']));
      log.rawDebug(`Created uploadId ${uploadId} in tmp dir ${tmpDir}`, meta);
      upload = {uploadId, files};
    }

    const parameters: {[key: string]: string} = {};
    for (const key of Object.keys(formFields)) {
      parameters[key] = formFields[key][0];
    }
    return {upload, parameters};
  } finally {
    if (!registered) {
      try {
        await cleanupCallback();
      } catch (err) {
        // Best effort: a cleanup failure should not mask the result or the original error.
        log.warn(`Error cleaning up unused upload dir ${tmpDir}: ${err}`);
      }
    }
  }
}
/**
 * Represents a single uploaded file on the server side. Only the FileUploadResult part
 * (origName, size, ext) is exposed to the browser for information purposes; absPath is kept
 * on the server.
 */
export interface FileUploadInfo extends FileUploadResult {
absPath: string; // Absolute path to the file on disk.
}
/**
 * Represents a complete upload on the server side. It may be a temporary directory containing a
 * list of files (not subdirectories), or a collection of non-temporary files. The
 * cleanupCallback() is responsible for removing the temporary directory. It should be a no-op for
 * non-temporary files.
 */
export interface UploadInfo {
uploadId: number; // ID of the upload
files: FileUploadInfo[]; // List of all files included in the upload.
tmpDir: string|null; // Temporary directory to remove, containing this upload.
// If present, all files must be direct children of this directory.
cleanupCallback: CleanupCB; // Callback to clean up this upload, including removing tmpDir.
cleanupTimer: InactivityTimer; // Triggers cleanup of this upload once it has been idle for too long.
accessId: string|null; // Optional identifier for access control purposes.
}
// A cleanup callback may finish synchronously or return a promise; callers in this file await it.
type CleanupCB = () => void|Promise<void>;
/**
 * An in-memory registry of uploads, keyed by a numeric uploadId. Every registered upload gets an
 * inactivity timer which cleans the upload up once it has not been looked up for
 * Deps.INACTIVITY_CLEANUP_MS milliseconds.
 */
export class UploadSet {
  private _uploads: Map<number, UploadInfo> = new Map();
  private _nextId: number = 0;

  /**
   * Register a new upload.
   *
   * @param files - Files making up the upload.
   * @param tmpDir - Temporary directory holding the files, or null for non-temporary files.
   * @param cleanupCallback - Called when the upload is cleaned up.
   * @param accessId - Identifier that later lookups must match, or null.
   * @returns The id assigned to the new upload.
   */
  public registerUpload(files: FileUploadInfo[], tmpDir: string|null, cleanupCallback: CleanupCB,
                        accessId: string|null): number {
    const uploadId = this._nextId++;
    const cleanupTimer = new InactivityTimer(() => this.cleanup(uploadId), Deps.INACTIVITY_CLEANUP_MS);
    this._uploads.set(uploadId, {uploadId, files, tmpDir, cleanupCallback, cleanupTimer, accessId});
    // Start counting inactivity from the moment of registration.
    cleanupTimer.ping();
    return uploadId;
  }

  /**
   * Returns full info for the given uploadId, if authorized.
   *
   * @throws ApiError with status 404 if the upload is unknown, or 403 if accessId does not match
   *   the one the upload was registered with.
   */
  public getUploadInfo(uploadId: number, accessId: string|null): UploadInfo {
    const info = this._getUploadInfoWithoutAuthorization(uploadId);
    if (info.accessId !== accessId) {
      throw new ApiError('access denied', 403);
    }
    return info;
  }

  /**
   * Clean up a particular upload: stop its timer, forget it, and run its cleanup callback.
   *
   * @throws ApiError with status 404 if the upload is unknown.
   */
  public async cleanup(uploadId: number): Promise<void> {
    log.debug("UploadSet: cleaning up uploadId %s", uploadId);
    const info = this._getUploadInfoWithoutAuthorization(uploadId);
    info.cleanupTimer.disable();
    this._uploads.delete(uploadId);
    await info.cleanupCallback();
  }

  /**
   * Clean up all uploads in this UploadSet. It may be used again after this call (it's called
   * multiple times in tests). Failures of individual cleanup callbacks are logged and do not
   * prevent the remaining uploads from being cleaned up.
   */
  public async cleanupAll(): Promise<void> {
    log.info("UploadSet: cleaning up all %d uploads in set", this._uploads.size);
    const uploads = Array.from(this._uploads.values());
    // Reset the state first, so that the set is immediately reusable.
    this._uploads.clear();
    this._nextId = 0;
    for (const info of uploads) {
      try {
        info.cleanupTimer.disable();
        await info.cleanupCallback();
      } catch (err) {
        log.warn(`Error cleaning upload ${info.uploadId}: ${err}`);
      }
    }
  }

  /**
   * Changes the name of an uploaded file. The upload must consist of exactly one file: it is an
   * error to use this on an upload with several files, or with none, and it will throw.
   *
   * @throws ApiError (404 or 403) under the same conditions as getUploadInfo().
   */
  public changeUploadName(uploadId: number, accessId: string|null, name: string): void {
    const info = this.getUploadInfo(uploadId, accessId);
    if (info.files.length > 1) {
      throw new Error("UploadSet.changeUploadName cannot operate on multiple files");
    }
    const [file] = info.files;
    if (!file) {
      // Fail with a clear message rather than a TypeError from dereferencing a missing file.
      throw new Error("UploadSet.changeUploadName cannot operate on an upload with no files");
    }
    file.origName = name;
  }

  /**
   * Returns full info for the given uploadId, without checking authorization.
   *
   * @throws ApiError with status 404 if the upload is unknown.
   */
  private _getUploadInfoWithoutAuthorization(uploadId: number): UploadInfo {
    const info = this._uploads.get(uploadId);
    if (!info) { throw new ApiError(`Unknown upload ${uploadId}`, 404); }
    // If the upload is being used, reschedule the inactivity timeout.
    info.cleanupTimer.ping();
    return info;
  }
}
// Maintains uploads created on this host. This is the UploadSet that the routes and fetch helpers
// in this module register their uploads with.
export const globalUploadSet: UploadSet = new UploadSet();
// Registers a handler to clean up on exit. We do this intentionally: even though module `tmp` has
// its own logic to clean up, that logic isn't triggered when the server is killed with a signal.
shutdown.addCleanupHandler(null, () => globalUploadSet.cleanupAll());
/**
 * Relocates an upload into a fresh temporary subdirectory created under newDir. Files of an
 * upload that lived in a temporary directory are moved; files of a non-temporary upload are
 * copied, leaving the originals in place. Nothing happens if the upload's temporary directory
 * is already within newDir.
 *
 * Once the files are in place, the upload's previous cleanupCallback is run (a failure there is
 * only logged), and uploadInfo is updated in place with the new file paths, the new tmpDir and a
 * cleanupCallback for it. All other fields of uploadInfo are left as they were.
 *
 * This is used specifically for placing uploads into a location accessible by sandboxed code.
 */
export async function moveUpload(uploadInfo: UploadInfo, newDir: string): Promise<void> {
  const alreadyInPlace = uploadInfo.tmpDir && isPathWithin(newDir, uploadInfo.tmpDir);
  if (alreadyInPlace) {
    return;
  }

  log.debug("UploadSet: moving uploadId %s to %s", uploadInfo.uploadId, newDir);
  const destination = await createTmpDir({dir: newDir});
  // Only files that live in a temporary directory may be moved away; others are copied.
  const isTemporary = Boolean(uploadInfo.tmpDir);

  const relocated: FileUploadInfo[] = [];
  for (const original of uploadInfo.files) {
    const target = path.join(destination.tmpDir, path.basename(original.absPath));
    if (isTemporary) {
      await fse.move(original.absPath, target);
    } else {
      await fse.copy(original.absPath, target);
    }
    relocated.push({...original, absPath: target});
  }

  try {
    await uploadInfo.cleanupCallback();
  } catch (err) {
    // This is unexpected, but if the move succeeded, let's warn but not fail on cleanup error.
    log.warn(`Error cleaning upload ${uploadInfo.uploadId} after move: ${err}`);
  }

  Object.assign(uploadInfo, {
    files: relocated,
    tmpDir: destination.tmpDir,
    cleanupCallback: destination.cleanupCallback,
  });
}
// Result of createTmpDir().
interface TmpDirResult {
tmpDir: string; // Path of the newly created temporary directory.
cleanupCallback: CleanupCB; // Removes tmpDir, including anything inside it.
}
/**
 * Creates a temporary directory via tmp.dir, using the 'grist-upload-' prefix and unsafeCleanup
 * unless the given options override them. The cleanup callback supplied by `tmp` is swapped for
 * an asynchronous one that removes the directory with fs-extra.
 */
export async function createTmpDir(options: tmp.Options): Promise<TmpDirResult> {
  const [tmpDir, tmpCleanup]: [string, CleanupCB] = await fromCallback(
    (done: any) => tmp.dir({prefix: 'grist-upload-', unsafeCleanup: true, ...options}, done),
    {multiArgs: true});

  const cleanupCallback = async (): Promise<void> => {
    // fs-extra is preferred for the actual removal, since it works asynchronously.
    await fse.remove(tmpDir);
    try {
      // The callback from `tmp` is invoked all the same, so that the module stops tracking
      // this directory and makes no attempt to delete it again on exit.
      await tmpCleanup();
    } catch (err) {
      // A failure is fine here: the directory has already been removed.
    }
  };

  return {tmpDir, cleanupCallback};
}
/**
 * Register a new upload with resource fetched from a public url. Returns the corresponding
 * UploadResult.
 *
 * Unless options.fileName is given, the file name is taken from the last path segment of the
 * url, ignoring any query string or fragment.
 *
 * @param url - Address to download from.
 * @param accessId - Access identifier to register the upload with, or null.
 * @param options - Optional settings; any fileName given here takes precedence.
 */
export async function fetchURL(url: string, accessId: string|null, options?: FetchUrlOptions): Promise<UploadResult> {
  return _fetchURL(url, accessId, {fileName: _fileNameFromUrl(url), ...options});
}

// Returns the last path segment of the url, leaving out any query string or fragment.
function _fileNameFromUrl(url: string): string {
  // Anything from the first '?' or '#' onwards is not part of the path: keeping it would leak
  // into the file name (e.g. "data.csv?dl=1") and into the extension derived from that name.
  const pathEnd = url.search(/[?#]/);
  return path.basename(pathEnd === -1 ? url : url.slice(0, pathEnd));
}
/**
 * Register a new upload with resource fetched from a url, optionally including credentials in request.
 * Returns the corresponding UploadResult.
 *
 * Google Drive urls are downloaded via downloadFromGDrive(); anything else via Deps.fetch. When
 * no file name is known (always the case for Google Drive urls), it is read from the response's
 * content-disposition header, falling back to 'document.grist'.
 *
 * @param url - Address to download from.
 * @param accessId - Access identifier to register the upload with, or null.
 * @param options - Optional file name, request headers, and Google authorization code.
 * @throws ApiError with status 400 if the connection fails with EPROTO, ECONNREFUSED or
 *   ENOTFOUND; other failures are rethrown unchanged.
 */
async function _fetchURL(url: string, accessId: string|null, options?: FetchUrlOptions): Promise<UploadResult> {
  try {
    const code = options?.googleAuthorizationCode;
    let fileName = options?.fileName ?? '';
    const headers = options?.headers;
    let response: FetchResponse;
    if (isDriveUrl(url)) {
      response = await downloadFromGDrive(url, code);
      fileName = ''; // Read the file name from headers.
    } else {
      response = await Deps.fetch(url, {
        redirect: 'follow',
        follow: 10,
        headers
      });
    }
    await _checkForError(response);
    if (fileName === '') {
      fileName = _fileNameFromDisposition(response.headers.get('content-disposition')) || 'document.grist';
    }
    const mimeType = response.headers.get('content-type');
    const {tmpDir, cleanupCallback} = await createTmpDir({});
    // Any name will do for the single file in tmpDir, but note that fileName may not be valid.
    const destPath = path.join(tmpDir, 'upload-content');
    let uploadedFile: FileUploadInfo;
    try {
      await new Promise<void>((resolve, reject) => {
        const dest = fse.createWriteStream(destPath, {autoClose: true});
        response.body.on('error', (err: Error) => {
          // A failing source does not end the destination on its own; close it explicitly.
          dest.destroy();
          reject(err);
        });
        dest.on('error', reject);
        dest.on('finish', () => resolve());
        response.body.pipe(dest);
      });
      uploadedFile = {
        absPath: path.resolve(destPath),
        origName: fileName,
        size: (await fse.stat(destPath)).size,
        ext: await guessExt(destPath, fileName, mimeType),
      };
    } catch (err) {
      // The upload was never registered, so nothing else would ever remove tmpDir.
      try {
        await cleanupCallback();
      } catch (cleanupErr) {
        log.warn(`Error cleaning up tmp dir ${tmpDir} after failed fetch: ${cleanupErr}`);
      }
      throw err;
    }
    log.debug(`done fetching url: ${url} to ${destPath}`);
    const uploadId = globalUploadSet.registerUpload([uploadedFile], tmpDir, cleanupCallback, accessId);
    return {uploadId, files: [pick(uploadedFile, ['origName', 'size', 'ext'])]};
  } catch(err) {
    if (err?.code === "EPROTO" || // https vs http error
        err?.code === "ECONNREFUSED" || // server does not listen
        err?.code === "ENOTFOUND") { // could not resolve domain
      throw new ApiError(`Can't connect to the server. The URL seems to be invalid. Error code ${err.code}`, 400);
    }
    throw err;
  }
}

// Extracts the filename parameter from a content-disposition header value, if there is one.
function _fileNameFromDisposition(disposition: string|null): string|undefined {
  // contentDisposition.parse() throws on an empty value, so a missing header is handled here.
  if (!disposition) { return undefined; }
  try {
    return contentDisposition.parse(disposition).parameters.filename;
  } catch (err) {
    // A malformed header should not fail the whole fetch; the caller falls back to a default name.
    log.warn(`Ignoring invalid content-disposition header: ${err}`);
    return undefined;
  }
}
/**
 * Fetches a Grist doc potentially managed by a different doc worker. Passes on credentials
 * supplied in the current request.
 *
 * @param server - Grist server, used for the home internal url and this server's own url.
 * @param docId - Id of the document to fetch.
 * @param req - Current request; its credentials are forwarded.
 * @param accessId - Access identifier to register the resulting upload with, or null.
 * @param template - Whether to download the document as a template rather than in full.
 * @returns The upload holding the downloaded document.
 */
export async function fetchDoc(server: GristServer, docId: string, req: Request, accessId: string|null,
                               template: boolean): Promise<UploadResult> {
  // Prepare headers that preserve credentials of current user.
  const headers = getTransitiveHeaders(req);

  // Passing the Origin header would serve no purpose here, as we are
  // constructing an internal request to fetch from our own doc worker
  // URL. Indeed, it may interfere, as it could incur a CORS check in
  // `trustOrigin`, which we do not need.
  delete headers.Origin;

  // Find the doc worker responsible for the document we wish to copy.
  // The backend needs to be well configured for this to work.
  const homeUrl = server.getHomeInternalUrl(req);
  const fetchUrl = new URL(`/api/worker/${docId}`, homeUrl);
  const response: FetchResponse = await Deps.fetch(fetchUrl.href, {headers});
  await _checkForError(response);
  const docWorkerUrl = getDocWorkerUrl(server.getOwnUrl(), await response.json());

  // Download the document, in full or as a template. The worker url is normalized to end with
  // exactly one slash, so that the relative path is resolved underneath it.
  const url = new URL(`api/docs/${docId}/download?template=${Number(template)}`,
                      docWorkerUrl.replace(/\/*$/, '/'));
  return _fetchURL(url.href, accessId, {headers});
}
// Turns an unsuccessful fetch response into a thrown ApiError. A successful response passes
// through silently, unless it carries an HTML page, which is taken to be a login page.
async function _checkForError(response: FetchResponse) {
  const {status} = response;
  if (status === 403) {
    throw new ApiError("Access to this resource was denied.", status);
  }

  if (response.ok) {
    const contentType = response.headers.get("content-type");
    if (!contentType?.startsWith("text/html")) {
      return;
    }
    // Probably we hit some login page
    const message = response.url.startsWith("https://accounts.google.com")
      ? "Importing directly from a Google Drive URL is not supported yet. " +
        'Use the "Import from Google Drive" menu option instead.'
      : "Could not import the requested file, check if you have all required permissions.";
    throw new ApiError(message, 403);
  }

  // The error body is expected to be JSON; anything unparsable counts as an empty object.
  const body = await response.json().catch(() => ({}));
  if (status === 404) {
    throw new ApiError("File can't be found at the requested URL.", 404);
  }
  const reason = body.error || response.statusText;
  const isServerError = status >= 500 && status < 600;
  throw new ApiError(
    isServerError ? `Remote server returned an error (${reason})` : reason,
    status,
    body.details);
}
/**
 * Builds an access identifier of the form `<userId>:<host>`, where the host is that of the doc
 * worker. The worker may be given directly as a host string, as a GristServer, or as a request
 * that carries a gristServer. Returns null in single-user mode, or when userId is null.
 * Including the host makes workers that share a process behave more like fully isolated
 * workers.
 */
export function makeAccessId(worker: string|Request|GristServer, userId: number|null): string|null {
  if (isSingleUserMode() || userId === null) {
    return null;
  }
  if (typeof worker === 'string') {
    return `${userId}:${worker}`;
  }
  if ('getHost' in worker) {
    return `${userId}:${worker.getHost()}`;
  }
  // What remains is a request, which should have a GristServer attached to it.
  const {gristServer} = worker as RequestWithGrist;
  if (!gristServer) {
    throw new Error('Problem accessing server with upload');
  }
  return `${userId}:${gristServer.getHost()}`;
}