@@ -1367,6 +1367,58 @@ Readability.prototype = {
13671367 } ) ;
13681368 } ,
13691369
1370+ _extractJSONLDMetadata : function ( parsed ) {
1371+ var metadata = { } ;
1372+
1373+ if ( typeof parsed . name === "string" && typeof parsed . headline === "string" && parsed . name !== parsed . headline ) {
1374+ // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
1375+ // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
1376+ // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
1377+
1378+ var title = this . _getArticleTitle ( ) ;
1379+ var nameMatches = this . _textSimilarity ( parsed . name , title ) > 0.75 ;
1380+ var headlineMatches = this . _textSimilarity ( parsed . headline , title ) > 0.75 ;
1381+
1382+ if ( headlineMatches && ! nameMatches ) {
1383+ metadata . title = parsed . headline ;
1384+ } else {
1385+ metadata . title = parsed . name ;
1386+ }
1387+ } else if ( typeof parsed . name === "string" ) {
1388+ metadata . title = parsed . name . trim ( ) ;
1389+ } else if ( typeof parsed . headline === "string" ) {
1390+ metadata . title = parsed . headline . trim ( ) ;
1391+ }
1392+ if ( parsed . author ) {
1393+ if ( typeof parsed . author . name === "string" ) {
1394+ metadata . byline = parsed . author . name . trim ( ) ;
1395+ } else if ( Array . isArray ( parsed . author ) && parsed . author [ 0 ] && typeof parsed . author [ 0 ] . name === "string" ) {
1396+ metadata . byline = parsed . author
1397+ . filter ( function ( author ) {
1398+ return author && typeof author . name === "string" ;
1399+ } )
1400+ . map ( function ( author ) {
1401+ return author . name . trim ( ) ;
1402+ } )
1403+ . join ( ", " ) ;
1404+ }
1405+ }
1406+ if ( typeof parsed . description === "string" ) {
1407+ metadata . excerpt = parsed . description . trim ( ) ;
1408+ }
1409+ if (
1410+ parsed . publisher &&
1411+ typeof parsed . publisher . name === "string"
1412+ ) {
1413+ metadata . siteName = parsed . publisher . name . trim ( ) ;
1414+ }
1415+ if ( typeof parsed . datePublished === "string" ) {
1416+ metadata . datePublished = parsed . datePublished . trim ( ) ;
1417+ }
1418+
1419+ return metadata ;
1420+ } ,
1421+
13701422 /**
13711423 * Try to extract metadata from JSON-LD object.
13721424 * For now, only Schema.org objects of type Article or its subtypes are supported.
@@ -1384,13 +1436,6 @@ Readability.prototype = {
13841436 var content = jsonLdElement . textContent . replace ( / ^ \s * < ! \[ C D A T A \[ | \] \] > \s * $ / g, "" ) ;
13851437 var parsed = JSON . parse ( content ) ;
13861438
1387- // some sites, like ones for academic journals, separate metadata for a journal article or paper from the
1388- // site's own metadata. eg: nature has only @context, @type (WebPage), and mainEntity so *all* relevant metadata
1389- // would be invisible without this.
1390- if ( parsed [ "mainEntity" ] ) {
1391- parsed = parsed [ "mainEntity" ] ;
1392- }
1393-
13941439 if (
13951440 ! parsed [ "@context" ] ||
13961441 ! parsed [ "@context" ] . match ( / ^ h t t p s ? \: \/ \/ s c h e m a \. o r g \/ ? $ / )
@@ -1414,54 +1459,15 @@ Readability.prototype = {
14141459 return ;
14151460 }
14161461
1417- metadata = { } ;
1418-
1419- if ( typeof parsed . name === "string" && typeof parsed . headline === "string" && parsed . name !== parsed . headline ) {
1420- // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
1421- // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
1422- // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
1462+ metadata = this . _extractJSONLDMetadata ( parsed ) ;
14231463
1424- var title = this . _getArticleTitle ( ) ;
1425- var nameMatches = this . _textSimilarity ( parsed . name , title ) > 0.75 ;
1426- var headlineMatches = this . _textSimilarity ( parsed . headline , title ) > 0.75 ;
1427-
1428- if ( headlineMatches && ! nameMatches ) {
1429- metadata . title = parsed . headline ;
1430- } else {
1431- metadata . title = parsed . name ;
1432- }
1433- } else if ( typeof parsed . name === "string" ) {
1434- metadata . title = parsed . name . trim ( ) ;
1435- } else if ( typeof parsed . headline === "string" ) {
1436- metadata . title = parsed . headline . trim ( ) ;
1437- }
1438- if ( parsed . author ) {
1439- if ( typeof parsed . author . name === "string" ) {
1440- metadata . byline = parsed . author . name . trim ( ) ;
1441- } else if ( Array . isArray ( parsed . author ) && parsed . author [ 0 ] && typeof parsed . author [ 0 ] . name === "string" ) {
1442- metadata . byline = parsed . author
1443- . filter ( function ( author ) {
1444- return author && typeof author . name === "string" ;
1445- } )
1446- . map ( function ( author ) {
1447- return author . name . trim ( ) ;
1448- } )
1449- . join ( ", " ) ;
1450- }
1451- }
1452- if ( typeof parsed . description === "string" ) {
1453- metadata . excerpt = parsed . description . trim ( ) ;
1454- }
1455- if (
1456- parsed . publisher &&
1457- typeof parsed . publisher . name === "string"
1458- ) {
1459- metadata . siteName = parsed . publisher . name . trim ( ) ;
1460- }
1461- if ( typeof parsed . datePublished === "string" ) {
1462- metadata . datePublished = parsed . datePublished . trim ( ) ;
1464+ // some sites, like ones for academic journals, separate metadata for a journal article or paper from the
1465+ // site's own metadata. eg: nature has only @context, @type (WebPage), and mainEntity so *all* relevant metadata
1466+ // would be invisible unless we retry using mainEntity.
1467+ if ( parsed [ "mainEntity" ] && Object . keys ( metadata ) . length === 0 ) {
1468+ metadata = this . _extractJSONLDMetadata ( parsed [ "mainEntity" ] ) ;
14631469 }
1464- return ;
1470+
14651471 } catch ( err ) {
14661472 this . log ( err . message ) ;
14671473 }
@@ -1477,25 +1483,18 @@ Readability.prototype = {
14771483 * @returns Name or names in "GivenName Surname" format
14781484 */
14791485 _normalizeByline : function ( name ) {
1486+ if ( ! name ) {
1487+ return name ;
1488+ }
1489+
14801490 var result = name ;
14811491
14821492 if ( Array . isArray ( name ) ) {
14831493 return name . map ( ( n ) => this . _normalizeByline ( n ) ) ;
14841494 }
14851495
1486- // handle Surname, GivenName formatting
1487- if ( name . includes ( "," ) ) {
1488- const parts = name . split ( "," ) . map ( part => part . trim ( ) ) ;
1489- if ( parts . length == 2 ) {
1490- result = `${ parts [ 1 ] } ${ parts [ 0 ] } ` ;
1491- }
1492- if ( parts . length > 2 ) {
1493- result = `${ parts [ 1 ] } ${ parts [ 0 ] } ${ parts . slice ( 2 ) . join ( " " ) } ` ;
1494- }
1495- }
1496-
1497- // remove things like "By:"
1498- result = result . replace ( / \w + : / , "" ) ;
1496+ // remove things like "By:" and "http://"
1497+ result = result . replace ( / \w + : \/ { 0 , 2 } / , "" ) ;
14991498
15001499 return this . _unescapeHtmlEntities ( result ) ;
15011500 } ,
@@ -1519,6 +1518,11 @@ Readability.prototype = {
15191518 // name is a single value
15201519 var namePattern = / ^ \s * (?: ( p r i s m | c i t a t i o n | d c | d c t e r m | o g | t w i t t e r | p a r s e l y | w e i b o : ( a r t i c l e | w e b p a g e ) ) \s * [ - _ \. : ] \s * ) ? ( a u t h o r | c r e a t o r | p u b - d a t e | p u b l i c a t i o n D a t e | p u b l i c a t i o n | d e s c r i p t i o n | t i t l e | s i t e _ n a m e ) \s * $ / i;
15211520
1521+ // fields which are permitted to have multiple distinct values, eg: byline
1522+ var byline_properties = [ "dc:creator" , "dcterm:creator" , "author" , "parsely-author" , "citation_author" ] ;
1523+ var multi_props = byline_properties ; // concat others here. somewhat pointless atm, but there will be more...
1524+
1525+
15221526 // Find description tags.
15231527 this . _forEachNode ( metaElements , function ( element ) {
15241528 var elementName = element . getAttribute ( "name" ) ;
@@ -1551,18 +1555,19 @@ Readability.prototype = {
15511555 }
15521556 }
15531557
1554- // handle properties which might have multiple distinct values, eg: citation_author
15551558 if ( result ) {
1556- if ( values [ name ] ) {
1559+ // handle properties which might have multiple distinct values
1560+ if ( values [ name ] && multi_props . includes ( name ) ) {
15571561 if ( Array . isArray ( values [ name ] ) && typeof result == "string" ) {
15581562 values [ name ] . push ( result ) ;
15591563 }
1560- if ( typeof values [ name ] == "string" ) {
1564+ if ( typeof values [ name ] == "string" && values [ name ] !== result ) {
15611565 values [ name ] = [ values [ name ] , result ] ;
15621566 }
15631567 } else {
15641568 values [ name ] = result ;
15651569 }
1570+
15661571 this . log ( `found metadata: ${ name } =${ values [ name ] } ` ) ;
15671572 }
15681573 } ) ;
@@ -1583,12 +1588,12 @@ Readability.prototype = {
15831588 }
15841589
15851590 // get author
1586- metadata . byline = jsonld . byline ||
1587- values [ "dc:creator" ] ||
1588- values [ "dcterm:creator" ] ||
1589- values [ "author" ] ||
1590- values [ "parsely-author" ] ||
1591- values [ "citation_author" ] ;
1591+ metadata . byline = jsonld . byline ;
1592+ for ( const n of byline_properties ) {
1593+ if ( metadata . byline )
1594+ break ;
1595+ metadata . byline = values [ n ] ;
1596+ }
15921597
15931598 // get description
15941599 metadata . excerpt = jsonld . excerpt ||
@@ -1619,6 +1624,7 @@ Readability.prototype = {
16191624 metadata . excerpt = this . _unescapeHtmlEntities ( metadata . excerpt ) ;
16201625 metadata . siteName = this . _unescapeHtmlEntities ( metadata . siteName ) ;
16211626 metadata . publishedTime = this . _unescapeHtmlEntities ( metadata . publishedTime ) ;
1627+ this . log ( `getArticleMetadata complete: ${ JSON . stringify ( metadata ) } ` ) ;
16221628
16231629 return metadata ;
16241630 } ,
0 commit comments