Skip to content

Commit d7949dc

Browse files
samuelhuanggijsk
authored andcommitted
Improve paragraph wrapping and DOM implementation
This change improves the logic for wrapping phrasing content in paragraphs, fixing two bugs: 1. Trailing whitespace was not always trimmed correctly from phrasing content at the end of a container &lt;div&gt;. 2. Leading whitespace nodes could be left behind as direct children of the <div> instead of being discarded. The logic in Readability.js has been refactored to use a more robust "collect and transform" pattern. It now uses a DocumentFragment to gather all consecutive phrasing content, correctly trims leading / trailing whitespace, and then wraps the non-empty result in a <p> tag. This produces cleaner HTML, as reflected in the updated test pages. To support this, several enhancements were made to the JSDOMParser.js DOM implementation: * Added support for DocumentFragment, including doc.createDocumentFragment(). * Centralized DOM insertion logic (appendChild(), insertBefore(), and replaceChild()) into a single, efficient _insertNodesAtIndex() private helper, ensuring consistency. * Made replaceChild() more robust by simplifying its implementation and fixing an edge case where replacing a node with itself failed. * Fixed a bug in remove() where it incorrectly modified element-specific properties on non-element nodes. The test suite in test-jsdomparser.js was expanded to validate these improvements, with new tests for DocumentFragment handling, node moving, and self-insertion/replacement edge cases.
1 parent 117f220 commit d7949dc

5 files changed

Lines changed: 384 additions & 146 deletions

File tree

JSDOMParser.js

Lines changed: 153 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -347,26 +347,129 @@
347347
return this.children[this.children.length - 1] || null;
348348
},
349349

350-
appendChild(child) {
351-
if (child.parentNode) {
352-
child.remove();
350+
/**
351+
* The workhorse for all node insertion operations. The public methods
352+
* (`appendChild()`, `insertBefore()`, `replaceChild()`) are thin wrappers
353+
* around this.
354+
*
355+
* @private
356+
* @param {Node[]} nodes - An array of nodes to insert. It is assumed that
357+
* these nodes are distinct, and are not children of this object.
358+
* @param {Number} index - A valid index to insert `nodes` at, or -1 to
359+
* indicate insertion as the last children.
360+
* @returns {void}
361+
*/
362+
_insertNodesAtIndex(nodes, index) {
363+
if (!nodes.length) {
364+
return;
365+
}
366+
367+
// Detach nodes from their previous parents.
368+
for (var i = 0; i < nodes.length; i++) {
369+
if (nodes[i].parentNode) {
370+
nodes[i].remove();
371+
}
353372
}
354373

355-
var last = this.lastChild;
356-
if (last) {
357-
last.nextSibling = child;
374+
var afterSibling = index === -1 ? null : this.childNodes[index];
375+
376+
// Store the previous sibling before we modify the DOM.
377+
var prevSibling = afterSibling
378+
? afterSibling.previousSibling
379+
: this.lastChild;
380+
381+
// Insert nodes into childNodes.
382+
var insertionPoint = index === -1 ? this.childNodes.length : index;
383+
Array.prototype.splice.apply(
384+
this.childNodes,
385+
[insertionPoint, 0].concat(nodes)
386+
);
387+
388+
// Update parentNode and sibling pointers for the new nodes.
389+
for (var j = 0; j < nodes.length; j++) {
390+
var node = nodes[j];
391+
node.parentNode = this;
392+
node.previousSibling = prevSibling;
393+
if (prevSibling) {
394+
prevSibling.nextSibling = node;
395+
}
396+
prevSibling = node;
397+
}
398+
var lastInsertedNode = nodes[nodes.length - 1];
399+
lastInsertedNode.nextSibling = afterSibling;
400+
if (afterSibling) {
401+
afterSibling.previousSibling = lastInsertedNode;
358402
}
359-
child.previousSibling = last;
360-
361-
if (child.nodeType === Node.ELEMENT_NODE) {
362-
child.previousElementSibling =
363-
this.children[this.children.length - 1] || null;
364-
this.children.push(child);
365-
child.previousElementSibling &&
366-
(child.previousElementSibling.nextElementSibling = child);
403+
404+
// Filter for element nodes and update children array and pointers.
405+
var elementsToInsert = [];
406+
for (var k = 0; k < nodes.length; k++) {
407+
if (nodes[k].nodeType === Node.ELEMENT_NODE) {
408+
elementsToInsert.push(nodes[k]);
409+
}
367410
}
368-
this.childNodes.push(child);
369-
child.parentNode = this;
411+
412+
if (elementsToInsert.length) {
413+
// Find the next element sibling to use as an insertion reference.
414+
// This is done after `childNodes` is modified, as the forward
415+
// traversal from `afterSibling` remains valid.
416+
var afterElem = afterSibling;
417+
while (afterElem && afterElem.nodeType !== Node.ELEMENT_NODE) {
418+
afterElem = afterElem.nextSibling;
419+
}
420+
421+
// Store the previous element sibling before more DOM modifications.
422+
var prevElem = afterElem
423+
? afterElem.previousElementSibling
424+
: this.lastElementChild;
425+
426+
var afterElemIndex = afterElem ? this.children.indexOf(afterElem) : -1;
427+
var elemInsertionPoint =
428+
afterElemIndex === -1 ? this.children.length : afterElemIndex;
429+
Array.prototype.splice.apply(
430+
this.children,
431+
[elemInsertionPoint, 0].concat(elementsToInsert)
432+
);
433+
434+
for (var l = 0; l < elementsToInsert.length; l++) {
435+
var elem = elementsToInsert[l];
436+
elem.previousElementSibling = prevElem;
437+
if (prevElem) {
438+
prevElem.nextElementSibling = elem;
439+
}
440+
prevElem = elem;
441+
}
442+
var lastInsertedElem = elementsToInsert[elementsToInsert.length - 1];
443+
lastInsertedElem.nextElementSibling = afterElem;
444+
if (afterElem) {
445+
afterElem.previousElementSibling = lastInsertedElem;
446+
}
447+
}
448+
},
449+
450+
appendChild(child) {
451+
var nodes =
452+
child.nodeType === Node.DOCUMENT_FRAGMENT_NODE
453+
? Array.from(child.childNodes)
454+
: [child];
455+
this._insertNodesAtIndex(nodes, -1);
456+
return child;
457+
},
458+
459+
insertBefore(newNode, referenceNode) {
460+
if (newNode === referenceNode) {
461+
return newNode;
462+
}
463+
var nodes =
464+
newNode.nodeType === Node.DOCUMENT_FRAGMENT_NODE
465+
? Array.from(newNode.childNodes)
466+
: [newNode];
467+
var index = referenceNode ? this.childNodes.indexOf(referenceNode) : -1;
468+
if (referenceNode && index === -1) {
469+
throw new Error("insertBefore: reference node not found");
470+
}
471+
this._insertNodesAtIndex(nodes, index);
472+
return newNode;
370473
},
371474

372475
remove() {
@@ -392,19 +495,19 @@
392495
childNodes.splice(childIndex, 1);
393496

394497
if (this.nodeType === Node.ELEMENT_NODE) {
395-
prev = this.previousElementSibling;
396-
next = this.nextElementSibling;
397-
if (prev) {
398-
prev.nextElementSibling = next;
498+
var prevElem = this.previousElementSibling;
499+
var nextElem = this.nextElementSibling;
500+
if (prevElem) {
501+
prevElem.nextElementSibling = nextElem;
399502
}
400-
if (next) {
401-
next.previousElementSibling = prev;
503+
if (nextElem) {
504+
nextElem.previousElementSibling = prevElem;
402505
}
403506
parent.children.splice(parent.children.indexOf(this), 1);
507+
this.previousElementSibling = this.nextElementSibling = null;
404508
}
405509

406510
this.previousSibling = this.nextSibling = null;
407-
this.previousElementSibling = this.nextElementSibling = null;
408511

409512
return this;
410513
},
@@ -414,110 +517,19 @@
414517
},
415518

416519
replaceChild(newNode, oldNode) {
417-
var childNodes = this.childNodes;
418-
var childIndex = childNodes.indexOf(oldNode);
419-
if (childIndex === -1) {
420-
throw new Error("replaceChild: node not found");
421-
} else {
422-
// This will take care of updating the new node if it was somewhere else before:
423-
if (newNode.parentNode) {
424-
newNode.remove();
425-
}
426-
427-
childNodes[childIndex] = newNode;
428-
429-
// update the new node's sibling properties, and its new siblings' sibling properties
430-
newNode.nextSibling = oldNode.nextSibling;
431-
newNode.previousSibling = oldNode.previousSibling;
432-
if (newNode.nextSibling) {
433-
newNode.nextSibling.previousSibling = newNode;
434-
}
435-
if (newNode.previousSibling) {
436-
newNode.previousSibling.nextSibling = newNode;
437-
}
438-
439-
newNode.parentNode = this;
440-
441-
// Now deal with elements before we clear out those values for the old node,
442-
// because it can help us take shortcuts here:
443-
if (newNode.nodeType === Node.ELEMENT_NODE) {
444-
if (oldNode.nodeType === Node.ELEMENT_NODE) {
445-
// Both were elements, which makes this easier, we just swap things out:
446-
newNode.previousElementSibling = oldNode.previousElementSibling;
447-
newNode.nextElementSibling = oldNode.nextElementSibling;
448-
if (newNode.previousElementSibling) {
449-
newNode.previousElementSibling.nextElementSibling = newNode;
450-
}
451-
if (newNode.nextElementSibling) {
452-
newNode.nextElementSibling.previousElementSibling = newNode;
453-
}
454-
this.children[this.children.indexOf(oldNode)] = newNode;
455-
} else {
456-
// Hard way:
457-
newNode.previousElementSibling = (function () {
458-
for (var i = childIndex - 1; i >= 0; i--) {
459-
if (childNodes[i].nodeType === Node.ELEMENT_NODE) {
460-
return childNodes[i];
461-
}
462-
}
463-
return null;
464-
})();
465-
if (newNode.previousElementSibling) {
466-
newNode.nextElementSibling =
467-
newNode.previousElementSibling.nextElementSibling;
468-
} else {
469-
newNode.nextElementSibling = (function () {
470-
for (var i = childIndex + 1; i < childNodes.length; i++) {
471-
if (childNodes[i].nodeType === Node.ELEMENT_NODE) {
472-
return childNodes[i];
473-
}
474-
}
475-
return null;
476-
})();
477-
}
478-
if (newNode.previousElementSibling) {
479-
newNode.previousElementSibling.nextElementSibling = newNode;
480-
}
481-
if (newNode.nextElementSibling) {
482-
newNode.nextElementSibling.previousElementSibling = newNode;
483-
}
484-
485-
if (newNode.nextElementSibling) {
486-
this.children.splice(
487-
this.children.indexOf(newNode.nextElementSibling),
488-
0,
489-
newNode
490-
);
491-
} else {
492-
this.children.push(newNode);
493-
}
494-
}
495-
} else if (oldNode.nodeType === Node.ELEMENT_NODE) {
496-
// new node is not an element node.
497-
// if the old one was, update its element siblings:
498-
if (oldNode.previousElementSibling) {
499-
oldNode.previousElementSibling.nextElementSibling =
500-
oldNode.nextElementSibling;
501-
}
502-
if (oldNode.nextElementSibling) {
503-
oldNode.nextElementSibling.previousElementSibling =
504-
oldNode.previousElementSibling;
505-
}
506-
this.children.splice(this.children.indexOf(oldNode), 1);
507-
508-
// If the old node wasn't an element, neither the new nor the old node was an element,
509-
// and the children array and its members shouldn't need any updating.
510-
}
511-
512-
oldNode.parentNode = null;
513-
oldNode.previousSibling = null;
514-
oldNode.nextSibling = null;
515-
if (oldNode.nodeType === Node.ELEMENT_NODE) {
516-
oldNode.previousElementSibling = null;
517-
oldNode.nextElementSibling = null;
518-
}
520+
if (newNode === oldNode) {
519521
return oldNode;
520522
}
523+
if (oldNode.parentNode !== this) {
524+
throw new Error(
525+
"replaceChild: node to be replaced is not a child of this node"
526+
);
527+
}
528+
// Insert the new node(s) before the node to be replaced.
529+
this.insertBefore(newNode, oldNode);
530+
// Now, remove the old node.
531+
oldNode.remove();
532+
return oldNode;
521533
},
522534

523535
__JSDOMParser__: true,
@@ -559,6 +571,17 @@
559571
nodeType: Node.COMMENT_NODE,
560572
};
561573

574+
var DocumentFragment = function () {
575+
this.childNodes = [];
576+
this.children = [];
577+
};
578+
579+
DocumentFragment.prototype = {
580+
__proto__: Node.prototype,
581+
nodeName: "#document-fragment",
582+
nodeType: Node.DOCUMENT_FRAGMENT_NODE,
583+
};
584+
562585
var Text = function () {
563586
this.childNodes = [];
564587
};
@@ -635,6 +658,10 @@
635658
return node;
636659
},
637660

661+
createDocumentFragment() {
662+
return new DocumentFragment();
663+
},
664+
638665
get baseURI() {
639666
if (!this.hasOwnProperty("_baseURI")) {
640667
this._baseURI = this.documentURI;
@@ -1264,6 +1291,7 @@
12641291
global.Node = Node;
12651292
global.Comment = Comment;
12661293
global.Document = Document;
1294+
global.DocumentFragment = DocumentFragment;
12671295
global.Element = Element;
12681296
global.Text = Text;
12691297

Readability.js

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,23 +1174,39 @@ Readability.prototype = {
11741174
// Turn all divs that don't have children block level elements into p's
11751175
if (node.tagName === "DIV") {
11761176
// Put phrasing content into paragraphs.
1177-
var p = null;
11781177
var childNode = node.firstChild;
11791178
while (childNode) {
11801179
var nextSibling = childNode.nextSibling;
11811180
if (this._isPhrasingContent(childNode)) {
1182-
if (p !== null) {
1183-
p.appendChild(childNode);
1184-
} else if (!this._isWhitespace(childNode)) {
1185-
p = doc.createElement("p");
1186-
node.replaceChild(p, childNode);
1187-
p.appendChild(childNode);
1181+
var fragment = doc.createDocumentFragment();
1182+
// Collect all consecutive phrasing content into a fragment.
1183+
do {
1184+
nextSibling = childNode.nextSibling;
1185+
fragment.appendChild(childNode);
1186+
childNode = nextSibling;
1187+
} while (childNode && this._isPhrasingContent(childNode));
1188+
1189+
// Trim leading and trailing whitespace from the fragment.
1190+
while (
1191+
fragment.firstChild &&
1192+
this._isWhitespace(fragment.firstChild)
1193+
) {
1194+
fragment.firstChild.remove();
11881195
}
1189-
} else if (p !== null) {
1190-
while (p.lastChild && this._isWhitespace(p.lastChild)) {
1191-
p.lastChild.remove();
1196+
while (
1197+
fragment.lastChild &&
1198+
this._isWhitespace(fragment.lastChild)
1199+
) {
1200+
fragment.lastChild.remove();
1201+
}
1202+
1203+
// If the fragment contains anything, wrap it in a paragraph and
1204+
// insert it before the next non-phrasing node.
1205+
if (fragment.firstChild) {
1206+
var p = doc.createElement("p");
1207+
p.appendChild(fragment);
1208+
node.insertBefore(p, nextSibling);
11921209
}
1193-
p = null;
11941210
}
11951211
childNode = nextSibling;
11961212
}

0 commit comments

Comments
 (0)