From 1133dbac337979ae26606f221151c6474b301de7 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Mon, 28 Oct 2019 21:22:59 +0100 Subject: [PATCH 1/2] Make the `ObjectLoader` use more efficient methods when determining if data needs to be loaded Currently, for data in `ChunkedStream` instances, the `getMissingChunks` method is used in a couple of places to determine if data is already available or if it needs to be loaded. When looking at how `ChunkedStream.getMissingChunks` is being used in the `ObjectLoader` you'll notice that we don't actually care about which *specific* chunks are missing, but rather only want essentially a yes/no answer to the "Is the data available?" question. Furthermore, when looking at how `ChunkedStream.getMissingChunks` itself is implemented you'll notice that it (somewhat expectedly) always iterates over *all* chunks. All in all, using `ChunkedStream.getMissingChunks` in the `ObjectLoader` seems like an unnecessary "heavy" and roundabout way to obtain a boolean value. However, it turns out there already exists a `ChunkedStream.allChunksLoaded` method, consisting of a *single* simple check, which seems like a perfect fit for the `ObjectLoader` use cases. In particular, once the *entire* PDF document has been loaded (which is usually fairly quick with streaming enabled), you'd really want the `ObjectLoader` to be as simple/quick as possible (similar to e.g. loading a local files) which this patch should help with. Note that I wouldn't expect this patch to have a huge effect on performance, but it will nonetheless save some CPU/memory resources when the `ObjectLoader` is used. (As usual this should help larger PDF documents, w.r.t. both file size and number of pages, the most.) --- src/core/chunked_stream.js | 6 ++++++ src/core/obj.js | 17 ++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/core/chunked_stream.js b/src/core/chunked_stream.js index cd2dadff7203d..f58aa53bd100e 100644 --- a/src/core/chunked_stream.js +++ b/src/core/chunked_stream.js @@ -285,6 +285,12 @@ class ChunkedStream { } return missingChunks; }; + ChunkedStreamSubstream.prototype.allChunksLoaded = function() { + if (this.numChunksLoaded === this.numChunks) { + return true; + } + return this.getMissingChunks().length === 0; + }; const subStream = new ChunkedStreamSubstream(); subStream.pos = subStream.start = start; diff --git a/src/core/obj.js b/src/core/obj.js index 1f3053db08688..818e65d52c710 100644 --- a/src/core/obj.js +++ b/src/core/obj.js @@ -27,7 +27,6 @@ import { Lexer, Parser } from './parser'; import { MissingDataException, toRomanNumerals, XRefEntryException, XRefParseException } from './core_utils'; -import { ChunkedStream } from './chunked_stream'; import { CipherTransformFactory } from './crypto'; import { ColorSpace } from './colorspace'; @@ -2072,14 +2071,14 @@ let ObjectLoader = (function() { } ObjectLoader.prototype = { - load() { - this.capability = createPromiseCapability(); - // Don't walk the graph if all the data is already loaded. - if (!(this.xref.stream instanceof ChunkedStream) || - this.xref.stream.getMissingChunks().length === 0) { - this.capability.resolve(); - return this.capability.promise; + async load() { + // Don't walk the graph if all the data is already loaded; note that only + // `ChunkedStream` instances have a `allChunksLoaded` method. + if (!this.xref.stream.allChunksLoaded || + this.xref.stream.allChunksLoaded()) { + return undefined; } + this.capability = createPromiseCapability(); let { keys, dict, } = this; this.refSet = new RefSet(); @@ -2126,7 +2125,7 @@ let ObjectLoader = (function() { let foundMissingData = false; for (let i = 0, ii = baseStreams.length; i < ii; i++) { let stream = baseStreams[i]; - if (stream.getMissingChunks && stream.getMissingChunks().length) { + if (stream.allChunksLoaded && !stream.allChunksLoaded()) { foundMissingData = true; pendingRequests.push({ begin: stream.start, end: stream.end, }); } From 2d35a49dd8223cb2f5bdf1e46ba9b88a9d0a268f Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Tue, 29 Oct 2019 18:12:35 +0100 Subject: [PATCH 2/2] Inline a couple of `isRef`/`isDict` checks in the `ObjectLoader` code As we've seen in numerous other cases, avoiding unnecessary function calls is never a bad thing (even if the effect is probably tiny here). --- src/core/obj.js | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/core/obj.js b/src/core/obj.js index 818e65d52c710..dd844b6dab84f 100644 --- a/src/core/obj.js +++ b/src/core/obj.js @@ -2038,13 +2038,13 @@ var FileSpec = (function FileSpecClosure() { */ let ObjectLoader = (function() { function mayHaveChildren(value) { - return isRef(value) || isDict(value) || Array.isArray(value) || - isStream(value); + return (value instanceof Ref) || (value instanceof Dict) || + Array.isArray(value) || isStream(value); } function addChildren(node, nodesToVisit) { - if (isDict(node) || isStream(node)) { - let dict = isDict(node) ? node : node.dict; + if ((node instanceof Dict) || isStream(node)) { + let dict = (node instanceof Dict) ? node : node.dict; let dictKeys = dict.getKeys(); for (let i = 0, ii = dictKeys.length; i < ii; i++) { let rawValue = dict.getRaw(dictKeys[i]); @@ -2104,7 +2104,7 @@ let ObjectLoader = (function() { let currentNode = nodesToVisit.pop(); // Only references or chunked streams can cause missing data exceptions. - if (isRef(currentNode)) { + if (currentNode instanceof Ref) { // Skip nodes that have already been visited. if (this.refSet.has(currentNode)) { continue; @@ -2144,7 +2144,7 @@ let ObjectLoader = (function() { let node = nodesToRevisit[i]; // Remove any reference nodes from the current `RefSet` so they // aren't skipped when we revist them. - if (isRef(node)) { + if (node instanceof Ref) { this.refSet.remove(node); } }