Skip to content

Commit d3d148d

Browse files
committed
fix tests
1 parent 729dca3 commit d3d148d

6 files changed

Lines changed: 94 additions & 83 deletions

File tree

Readability.js

Lines changed: 81 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1367,6 +1367,58 @@ Readability.prototype = {
13671367
});
13681368
},
13691369

1370+
_extractJSONLDMetadata: function (parsed) {
1371+
var metadata = {};
1372+
1373+
if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) {
1374+
// we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
1375+
// put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
1376+
// "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
1377+
1378+
var title = this._getArticleTitle();
1379+
var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
1380+
var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75;
1381+
1382+
if (headlineMatches && !nameMatches) {
1383+
metadata.title = parsed.headline;
1384+
} else {
1385+
metadata.title = parsed.name;
1386+
}
1387+
} else if (typeof parsed.name === "string") {
1388+
metadata.title = parsed.name.trim();
1389+
} else if (typeof parsed.headline === "string") {
1390+
metadata.title = parsed.headline.trim();
1391+
}
1392+
if (parsed.author) {
1393+
if (typeof parsed.author.name === "string") {
1394+
metadata.byline = parsed.author.name.trim();
1395+
} else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") {
1396+
metadata.byline = parsed.author
1397+
.filter(function(author) {
1398+
return author && typeof author.name === "string";
1399+
})
1400+
.map(function(author) {
1401+
return author.name.trim();
1402+
})
1403+
.join(", ");
1404+
}
1405+
}
1406+
if (typeof parsed.description === "string") {
1407+
metadata.excerpt = parsed.description.trim();
1408+
}
1409+
if (
1410+
parsed.publisher &&
1411+
typeof parsed.publisher.name === "string"
1412+
) {
1413+
metadata.siteName = parsed.publisher.name.trim();
1414+
}
1415+
if (typeof parsed.datePublished === "string") {
1416+
metadata.datePublished = parsed.datePublished.trim();
1417+
}
1418+
1419+
return metadata;
1420+
},
1421+
13701422
/**
13711423
* Try to extract metadata from JSON-LD object.
13721424
* For now, only Schema.org objects of type Article or its subtypes are supported.
@@ -1384,13 +1436,6 @@ Readability.prototype = {
13841436
var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, "");
13851437
var parsed = JSON.parse(content);
13861438

1387-
// some sites, like ones for academic journals, separate metadata for a journal article or paper from the
1388-
// site's own metadata. eg: nature has only @context, @type (WebPage), and mainEntity so *all* relevant metadata
1389-
// would be invisible without this.
1390-
if (parsed["mainEntity"]) {
1391-
parsed = parsed["mainEntity"];
1392-
}
1393-
13941439
if (
13951440
!parsed["@context"] ||
13961441
!parsed["@context"].match(/^https?\:\/\/schema\.org\/?$/)
@@ -1414,54 +1459,15 @@ Readability.prototype = {
14141459
return;
14151460
}
14161461

1417-
metadata = {};
1418-
1419-
if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) {
1420-
// we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
1421-
// put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
1422-
// "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
1462+
metadata = this._extractJSONLDMetadata(parsed);
14231463

1424-
var title = this._getArticleTitle();
1425-
var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
1426-
var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75;
1427-
1428-
if (headlineMatches && !nameMatches) {
1429-
metadata.title = parsed.headline;
1430-
} else {
1431-
metadata.title = parsed.name;
1432-
}
1433-
} else if (typeof parsed.name === "string") {
1434-
metadata.title = parsed.name.trim();
1435-
} else if (typeof parsed.headline === "string") {
1436-
metadata.title = parsed.headline.trim();
1437-
}
1438-
if (parsed.author) {
1439-
if (typeof parsed.author.name === "string") {
1440-
metadata.byline = parsed.author.name.trim();
1441-
} else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") {
1442-
metadata.byline = parsed.author
1443-
.filter(function(author) {
1444-
return author && typeof author.name === "string";
1445-
})
1446-
.map(function(author) {
1447-
return author.name.trim();
1448-
})
1449-
.join(", ");
1450-
}
1451-
}
1452-
if (typeof parsed.description === "string") {
1453-
metadata.excerpt = parsed.description.trim();
1454-
}
1455-
if (
1456-
parsed.publisher &&
1457-
typeof parsed.publisher.name === "string"
1458-
) {
1459-
metadata.siteName = parsed.publisher.name.trim();
1460-
}
1461-
if (typeof parsed.datePublished === "string") {
1462-
metadata.datePublished = parsed.datePublished.trim();
1464+
// Some sites, such as those of academic journals, keep the metadata for an article or paper separate from the
1465+
// site's own metadata. E.g. Nature has only @context, @type (WebPage), and mainEntity, so *all* relevant metadata
1466+
// would be invisible unless we retry using mainEntity.
1467+
if (parsed["mainEntity"] && Object.keys(metadata).length === 0) {
1468+
metadata = this._extractJSONLDMetadata(parsed["mainEntity"]);
14631469
}
1464-
return;
1470+
14651471
} catch (err) {
14661472
this.log(err.message);
14671473
}
@@ -1477,25 +1483,18 @@ Readability.prototype = {
14771483
* @returns Name or names in "GivenName Surname" format
14781484
*/
14791485
_normalizeByline: function(name) {
1486+
if (!name) {
1487+
return name;
1488+
}
1489+
14801490
var result = name;
14811491

14821492
if (Array.isArray(name)) {
14831493
return name.map((n) => this._normalizeByline(n));
14841494
}
14851495

1486-
// handle Surname, GivenName formatting
1487-
if (name.includes(",")) {
1488-
const parts = name.split(",").map(part => part.trim());
1489-
if (parts.length == 2) {
1490-
result = `${parts[1]} ${parts[0]}`;
1491-
}
1492-
if (parts.length > 2) {
1493-
result = `${parts[1]} ${parts[0]} ${parts.slice(2).join(" ")}`;
1494-
}
1495-
}
1496-
1497-
// remove things like "By:"
1498-
result = result.replace(/\w+:/, "");
1496+
// remove things like "By:" and "http://"
1497+
result = result.replace(/\w+:\/{0,2}/, "");
14991498

15001499
return this._unescapeHtmlEntities(result);
15011500
},
@@ -1519,6 +1518,11 @@ Readability.prototype = {
15191518
// name is a single value
15201519
var namePattern = /^\s*(?:(prism|citation|dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-_\.:]\s*)?(author|creator|pub-date|publicationDate|publication|description|title|site_name)\s*$/i;
15211520

1521+
// Fields that are permitted to have multiple distinct values, e.g. byline
1522+
var byline_properties = [ "dc:creator", "dcterm:creator", "author", "parsely-author", "citation_author"];
1523+
var multi_props = byline_properties; // concat others here. somewhat pointless atm, but there will be more...
1524+
1525+
15221526
// Find description tags.
15231527
this._forEachNode(metaElements, function(element) {
15241528
var elementName = element.getAttribute("name");
@@ -1551,18 +1555,19 @@ Readability.prototype = {
15511555
}
15521556
}
15531557

1554-
// handle properties which might have multiple distinct values, eg: citation_author
15551558
if (result) {
1556-
if (values[name]) {
1559+
// handle properties which might have multiple distinct values
1560+
if (values[name] && multi_props.includes(name)) {
15571561
if (Array.isArray(values[name]) && typeof result == "string") {
15581562
values[name].push(result);
15591563
}
1560-
if (typeof values[name] == "string") {
1564+
if (typeof values[name] == "string" && values[name] !== result) {
15611565
values[name] = [values[name], result];
15621566
}
15631567
} else {
15641568
values[name] = result;
15651569
}
1570+
15661571
this.log(`found metadata: ${name}=${values[name]}`);
15671572
}
15681573
});
@@ -1583,12 +1588,12 @@ Readability.prototype = {
15831588
}
15841589

15851590
// get author
1586-
metadata.byline = jsonld.byline ||
1587-
values["dc:creator"] ||
1588-
values["dcterm:creator"] ||
1589-
values["author"] ||
1590-
values["parsely-author"] ||
1591-
values["citation_author"];
1591+
metadata.byline = jsonld.byline;
1592+
for (const n of byline_properties) {
1593+
if (metadata.byline)
1594+
break;
1595+
metadata.byline = values[n];
1596+
}
15921597

15931598
// get description
15941599
metadata.excerpt = jsonld.excerpt ||
@@ -1619,6 +1624,7 @@ Readability.prototype = {
16191624
metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
16201625
metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
16211626
metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);
1627+
this.log(`getArticleMetadata complete: ${JSON.stringify(metadata)}`);
16221628

16231629
return metadata;
16241630
},

test/test-pages/003-metadata-preferred/expected-metadata.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"title": "Dublin Core property title",
3-
"byline": "Dublin Core property author",
3+
"byline": "Dublin Core author",
44
"dir": null,
55
"excerpt": "Dublin Core property description",
66
"siteName": null,

test/test-pages/003-metadata-preferred/source.html

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@
1111
<meta property="twitter:title" content="Twitter property title"/>
1212
<meta property="og:title" content="Open Graph property title"/>
1313
<meta name="author" content="Meta name author"/>
14-
<meta name="DC.creator" content="Dublin Core name author"/>
15-
<meta property="dc:creator" content="Dublin Core property author"/>
16-
<meta name="description" content="Meta name description"/>
14+
<!-- now that multiple authors are supported, these have to be identical to prevent them from showing up
15+
as two separate authors -->
16+
<meta name="DC.creator" content="Dublin Core author"/>
17+
<meta property="dc:creator" content="Dublin Core author"/>
18+
<meta name="description" content="Meta name description"/>
1719
<meta name="og:description" content="Open Graph name description"/>
1820
<meta name="twitter:description" content="Twitter name description"/>
1921
<meta name="DC.description" content="Dublin Core name description"/>

test/test-pages/ebb-org/expected-metadata.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"title": "On Recent Controversial Events - Bradley M. Kuhn ( Brad ) ( bkuhn )",
3-
"byline": "Bradley M. Kuhn (http://ebb.org/bkuhn/)",
3+
"byline": "Bradley M. Kuhn (ebb.org/bkuhn/)",
44
"dir": null,
55
"lang": "en-US",
66
"excerpt": "The website of Bradley M. Kuhn, aka Brad, aka bkuhn. This site includes his GPG keys, resume, blog, projects list, software, interviews, speeches and writing.",

test/test-pages/ietf-1/expected-metadata.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
{
22
"title": "remoteStorage",
3-
"byline": "Jong, Michiel de",
3+
"byline": [
4+
"Kooman, F.",
5+
"Jong, Michiel de"
6+
],
47
"dir": null,
58
"lang": "en",
69
"siteName": null,

test/test-pages/nature/expected-metadata.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
{
22
"title": "Worldwide divergence of values",
33
"byline": [
4-
"Joshua Conrad Jackson",
5-
"Danila Medvedev"
4+
"Jackson, Joshua Conrad",
5+
"Medvedev, Danila"
66
],
77
"dir": null,
88
"lang": "en",

0 commit comments

Comments
 (0)