fix tests

inhumantsar · inhumantsar · commit d3d148dcaa6c · 2024-06-11T15:33:27.000-05:00
diff --git a/Readability.js b/Readability.js
@@ -1367,6 +1367,58 @@ Readability.prototype = {
     });
   },
 
+  /**
+   * Build a metadata object (title, byline, excerpt, siteName, datePublished) from one parsed JSON-LD node.
+   * Only string-valued fields are copied, each trimmed; fields that are absent are left off the result.
+   *
+   * @param {Object} parsed a parsed JSON-LD object
+   * @returns {Object} metadata object, empty when no supported field was found
+   */
+  _extractJSONLDMetadata: function (parsed) {
+    var metadata = {};
+
+    if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) {
+      // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
+      // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
+      // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
+      var title = this._getArticleTitle();
+      var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
+      var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75;
+
+      // trimmed like every other branch, so the title does not depend on which field was picked
+      metadata.title = (headlineMatches && !nameMatches ? parsed.headline : parsed.name).trim();
+    } else if (typeof parsed.name === "string") {
+      metadata.title = parsed.name.trim();
+    } else if (typeof parsed.headline === "string") {
+      metadata.title = parsed.headline.trim();
+    }
+    if (parsed.author) {
+      if (typeof parsed.author.name === "string") {
+        metadata.byline = parsed.author.name.trim();
+      } else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") {
+        metadata.byline = parsed.author
+          .filter(function(author) {
+            return author && typeof author.name === "string";
+          })
+          .map(function(author) {
+            return author.name.trim();
+          })
+          .join(", ");
+      }
+    }
+    if (typeof parsed.description === "string") {
+      metadata.excerpt = parsed.description.trim();
+    }
+    if (parsed.publisher && typeof parsed.publisher.name === "string") {
+      metadata.siteName = parsed.publisher.name.trim();
+    }
+    if (typeof parsed.datePublished === "string") {
+      metadata.datePublished = parsed.datePublished.trim();
+    }
+
+    return metadata;
+  },
+
   /**
    * Try to extract metadata from JSON-LD object.
    * For now, only Schema.org objects of type Article or its subtypes are supported.
@@ -1384,13 +1436,6 @@ Readability.prototype = {
           var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, "");
           var parsed = JSON.parse(content);
 
-          // some sites, like ones for academic journals, separate metadata for a journal article or paper from the
-          // site's own metadata. eg: nature has only @context, @type (WebPage), and mainEntity so *all* relevant metadata
-          // would be invisible without this.
-          if (parsed["mainEntity"]) {
-            parsed = parsed["mainEntity"];
-          }
-
           if (
             !parsed["@context"] ||
             !parsed["@context"].match(/^https?\:\/\/schema\.org\/?$/)
@@ -1414,54 +1459,15 @@ Readability.prototype = {
             return;
           }
 
-          metadata = {};
-
-          if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) {
-            // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
-            // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
-            // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
+          metadata = this._extractJSONLDMetadata(parsed);
 
-            var title = this._getArticleTitle();
-            var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
-            var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75;
-
-            if (headlineMatches && !nameMatches) {
-              metadata.title = parsed.headline;
-            } else {
-              metadata.title = parsed.name;
-            }
-          } else if (typeof parsed.name === "string") {
-            metadata.title = parsed.name.trim();
-          } else if (typeof parsed.headline === "string") {
-            metadata.title = parsed.headline.trim();
-          }
-          if (parsed.author) {
-            if (typeof parsed.author.name === "string") {
-              metadata.byline = parsed.author.name.trim();
-            } else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") {
-              metadata.byline = parsed.author
-                .filter(function(author) {
-                  return author && typeof author.name === "string";
-                })
-                .map(function(author) {
-                  return author.name.trim();
-                })
-                .join(", ");
-            }
-          }
-          if (typeof parsed.description === "string") {
-            metadata.excerpt = parsed.description.trim();
-          }
-          if (
-            parsed.publisher &&
-            typeof parsed.publisher.name === "string"
-          ) {
-            metadata.siteName = parsed.publisher.name.trim();
-          }
-          if (typeof parsed.datePublished === "string") {
-            metadata.datePublished = parsed.datePublished.trim();
+          // some sites, like ones for academic journals, separate metadata for a journal article or paper from the
+          // site's own metadata. eg: nature has only @context, @type (WebPage), and mainEntity so *all* relevant metadata
+          // would be invisible unless we retry using mainEntity.
+          if (parsed["mainEntity"] && Object.keys(metadata).length === 0) {
+            metadata = this._extractJSONLDMetadata(parsed["mainEntity"]);
           }
-          return;
+
         } catch (err) {
           this.log(err.message);
         }
@@ -1477,25 +1483,18 @@ Readability.prototype = {
    * @returns Name or names in "GivenName Surname" format
    */
   _normalizeByline: function(name) {
+    if (!name) {
+      return name;
+    }
+
     var result = name;
 
     if (Array.isArray(name)) {
       return name.map((n) => this._normalizeByline(n));
     }
 
-    // handle Surname, GivenName formatting
-    if (name.includes(",")) {
-      const parts = name.split(",").map(part => part.trim());
-      if (parts.length == 2) {
-        result = `${parts[1]} ${parts[0]}`;
-      }
-      if (parts.length > 2) {
-        result = `${parts[1]} ${parts[0]} ${parts.slice(2).join(" ")}`;
-      }
-    }
-
-    // remove things like "By:"
-    result = result.replace(/\w+:/, "");
+    // remove things like "By:" and "http://"
+    result = result.replace(/\w+:\/{0,2}/, "");
 
     return this._unescapeHtmlEntities(result);
   },
@@ -1519,6 +1518,11 @@ Readability.prototype = {
     // name is a single value
     var namePattern = /^\s*(?:(prism|citation|dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-_\.:]\s*)?(author|creator|pub-date|publicationDate|publication|description|title|site_name)\s*$/i;
 
+    // fields which are permitted to have multiple distinct values, eg: byline
+    var byline_properties = [ "dc:creator", "dcterm:creator", "author", "parsely-author", "citation_author"];
+    var multi_props = byline_properties; // concat others here. somewhat pointless atm, but there will be more...
+
+
     // Find description tags.
     this._forEachNode(metaElements, function(element) {
       var elementName = element.getAttribute("name");
@@ -1551,18 +1555,19 @@ Readability.prototype = {
         }
       }
 
-      // handle properties which might have multiple distinct values, eg: citation_author
       if (result) {
-        if (values[name]) {
+        // handle properties which might have multiple distinct values
+        if (values[name] && multi_props.includes(name)) {
           if (Array.isArray(values[name]) && typeof result == "string") {
             values[name].push(result);
           }
-          if (typeof values[name] == "string") {
+          if (typeof values[name] == "string" && values[name] !== result) {
             values[name] = [values[name], result];
           }
         } else {
           values[name] = result;
         }
+
         this.log(`found metadata: ${name}=${values[name]}`);
       }
     });
@@ -1583,12 +1588,12 @@ Readability.prototype = {
     }
 
     // get author
-    metadata.byline = jsonld.byline ||
-                      values["dc:creator"] ||
-                      values["dcterm:creator"] ||
-                      values["author"] ||
-                      values["parsely-author"] ||
-                      values["citation_author"];
+    metadata.byline = jsonld.byline;
+    for (const n of byline_properties) {
+      if (metadata.byline)
+        break;
+      metadata.byline = values[n];
+    }
 
     // get description
     metadata.excerpt = jsonld.excerpt ||
@@ -1619,6 +1624,7 @@ Readability.prototype = {
     metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
     metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
     metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);
+    this.log(`getArticleMetadata complete: ${JSON.stringify(metadata)}`);
 
     return metadata;
   },
diff --git a/test/test-pages/003-metadata-preferred/expected-metadata.json b/test/test-pages/003-metadata-preferred/expected-metadata.json
@@ -1,6 +1,6 @@
 {
   "title": "Dublin Core property title",
-  "byline": "Dublin Core property author",
+  "byline": "Dublin Core author",
   "dir": null,
   "excerpt": "Dublin Core property description",
   "siteName": null,
diff --git a/test/test-pages/003-metadata-preferred/source.html b/test/test-pages/003-metadata-preferred/source.html
@@ -11,9 +11,11 @@
     <meta property="twitter:title" content="Twitter property title"/>
     <meta property="og:title" content="Open Graph property title"/>
     <meta name="author" content="Meta name author"/>
-    <meta name="DC.creator" content="Dublin Core name author"/>
-    <meta property="dc:creator" content="Dublin Core property author"/>
-     <meta name="description" content="Meta name description"/>
+    <!-- now that multiple authors are supported, these have to be identical to prevent them from showing up
+         as two separate authors -->
+    <meta name="DC.creator" content="Dublin Core author"/>
+    <meta property="dc:creator" content="Dublin Core author"/>
+    <meta name="description" content="Meta name description"/>
     <meta name="og:description" content="Open Graph name description"/>
     <meta name="twitter:description" content="Twitter name description"/>
     <meta name="DC.description" content="Dublin Core name description"/>
diff --git a/test/test-pages/ebb-org/expected-metadata.json b/test/test-pages/ebb-org/expected-metadata.json
@@ -1,6 +1,6 @@
 {
   "title": "On Recent Controversial Events - Bradley M. Kuhn ( Brad ) ( bkuhn )",
-  "byline": "Bradley M. Kuhn (http://ebb.org/bkuhn/)",
+  "byline": "Bradley M. Kuhn (ebb.org/bkuhn/)",
   "dir": null,
   "lang": "en-US",
   "excerpt": "The website of Bradley M. Kuhn, aka Brad, aka bkuhn. This site includes his GPG keys, resume, blog, projects list, software, interviews, speeches and writing.",
diff --git a/test/test-pages/ietf-1/expected-metadata.json b/test/test-pages/ietf-1/expected-metadata.json
@@ -1,6 +1,9 @@
 {
   "title": "remoteStorage",
-  "byline": "Jong, Michiel de",
+  "byline": [
+    "Kooman, F.",
+    "Jong, Michiel de"
+  ],
   "dir": null,
   "lang": "en",
   "siteName": null,
diff --git a/test/test-pages/nature/expected-metadata.json b/test/test-pages/nature/expected-metadata.json
@@ -1,8 +1,8 @@
 {
   "title": "Worldwide divergence of values",
   "byline": [
-    "Joshua Conrad Jackson",
-    "Danila Medvedev"
+    "Jackson, Joshua Conrad",
+    "Medvedev, Danila"
   ],
   "dir": null,
   "lang": "en",

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"title": "Dublin Core property title",`
`3`		`- "byline": "Dublin Core property author",`
	`3`	`+ "byline": "Dublin Core author",`
`4`	`4`	`"dir": null,`
`5`	`5`	`"excerpt": "Dublin Core property description",`
`6`	`6`	`"siteName": null,`
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"title": "On Recent Controversial Events - Bradley M. Kuhn ( Brad ) ( bkuhn )",`
`3`		`- "byline": "Bradley M. Kuhn (http://ebb.org/bkuhn/)",`
	`3`	`+ "byline": "Bradley M. Kuhn (ebb.org/bkuhn/)",`
`4`	`4`	`"dir": null,`
`5`	`5`	`"lang": "en-US",`
`6`	`6`	`"excerpt": "The website of Bradley M. Kuhn, aka Brad, aka bkuhn. This site includes his GPG keys, resume, blog, projects list, software, interviews, speeches and writing.",`