Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/coding-standards.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ env:
jobs:
coding-standards:
name: "CS Fixer & PHPStan"
runs-on: "ubuntu-22.04"
runs-on: "ubuntu-latest"

strategy:
matrix:
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/continuous-integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ env:
jobs:
phpunit:
name: "PHPUnit (PHP ${{ matrix.php }})"
runs-on: "ubuntu-22.04"
runs-on: "ubuntu-latest"

strategy:
matrix:
Expand Down Expand Up @@ -66,7 +66,7 @@ jobs:

phpunit-coverage:
name: "PHPUnit coverage (PHP ${{ matrix.php }})"
runs-on: "ubuntu-22.04"
runs-on: "ubuntu-latest"

strategy:
matrix:
Expand Down Expand Up @@ -117,7 +117,7 @@ jobs:

phpunit-lowest:
name: "PHPUnit lowest deps (PHP ${{ matrix.php }})"
runs-on: "ubuntu-22.04"
runs-on: "ubuntu-latest"

strategy:
matrix:
Expand Down Expand Up @@ -158,7 +158,7 @@ jobs:

phpunit-composerv2:
name: "PHPUnit with Composer v1 (PHP ${{ matrix.php }})"
runs-on: "ubuntu-20.04"
runs-on: "ubuntu-latest"

strategy:
matrix:
Expand Down
38 changes: 7 additions & 31 deletions src/Readability.php
Original file line number Diff line number Diff line change
Expand Up @@ -1435,7 +1435,7 @@ private function loadHtml()
unset($tidy);
}

$this->html = self::ensureMetaCharset((string) $this->html);
$this->html = self::entitizeNonAscii((string) $this->html);

if ('html5lib' === $this->parser || 'html5' === $this->parser) {
$this->dom = (new HTML5())->loadHTML($this->html);
Expand All @@ -1455,43 +1455,19 @@ private function loadHtml()
}

/**
* Tries to insert `meta[charset]` tag into the proper place in the passed HTML document.
* Converts non-ASCII UTF-8 characters to numeric HTML entities.
*
* `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
* This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field would be parsed with incorrect encoding.
* Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause parser to auto-insert a `html` element, losing the attributes of the original `html` tag.
*
* @param string $html UTF-8 encoded document
*/
private static function ensureMetaCharset($html)
private static function entitizeNonAscii($html)
{
$charsetTag = '<meta charset="utf-8">';
$convmap = [
0x80, 0x1FFFFF, 0, 0x10FFFF,
];

// Only look at first 1024 bytes since, according to HTML5 specification,
// that’s where <meta> elements declaring a character encoding must be located.
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset
$start = substr($html, 0, 1000);

if (1 === preg_match('/<meta[^>]+charset/i', $start)) {
// <meta> tag is already present, no need for modification.
return $html;
}

if (1 === preg_match('/<head[^>]*>/i', $start)) {
// <head> tag was located, <meta> tags go there.
$html = preg_replace('/<head[^>]*>/i', '$0' . $charsetTag, $html, 1);

return $html;
}

if (1 === preg_match('/<html[^>]*>/i', $start)) {
// <html> tag was located, let’s put it inside and have parser create <head>.
$html = preg_replace('/<html[^>]*>/i', '$0' . $charsetTag, $html, 1);

return $html;
}

// Fallback – just plop the <meta> at the start of the fragment.
return $charsetTag . $html;
return mb_encode_numericentity($html, $convmap, 'utf8', true);
}
}
24 changes: 20 additions & 4 deletions tests/ReadabilityTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -486,28 +486,42 @@ public function testWithWipedBody()
$this->assertStringContainsString('<a href="alice-I.html">Down the Rabbit-Hole</a>', $readability->getContent()->getInnerHtml());
}

// https://github.com/wallabag/wallabag/issues/8158
public function testCharsetAfterTitle()
{
$readability = $this->getReadability('<!DOCTYPE html><html lang="et"><head><title>Tõde ja õigus I</title> <meta charset="utf-8"></head><body><p>See oli läinud aastasaja kolmanda veerandi lõpul. Päike lähenes silmapiirile, seistes sedavõrd madalas, et enam ei ulatunud valgustama ei mäkke ronivat hobust, kes puutelgedega vankrit vedas, ei vankril istuvat noort naist ega ka ligi kolmekümnelist meest, kes kõndis vankri kõrval.</p></body></html>', 'https://et.wikisource.org/wiki/T%C3%B5de_ja_%C3%B5igus_I/I');
$readability->convertLinksToFootnotes = true;
$res = $readability->init();

$this->assertTrue($res);
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
$this->assertSame('Tõde ja õigus I', $readability->getTitle()->getInnerHtml());
$this->assertStringContainsString('Päike lähenes', $readability->getContent()->getInnerHtml());
}

/**
* @return array<string, array{0: string, 1: string, 2?: bool}>
*/
public function dataForHtmlLang()
{
return [
'meta' => [
'<html lang="fr"><head><meta charset="utf-8"></head><body><article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article></body></html>',
'<html lang="fr"><head><meta charset="utf-8"></head><body><article>' . str_repeat('<p>Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité.</p>', 7) . '</article></body></html>',
'fr',
],
'head' => [
'<html lang="fr"><head><title>Foo</title></head><body><article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article></body></html>',
'<html lang="fr"><head><title>Foo</title></head><body><article>' . str_repeat('<p>Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité.</p>', 7) . '</article></body></html>',
'fr',
],
'headless' => [
'<html lang="fr"><body><article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article></body></html>',
'<html lang="fr"><body><article>' . str_repeat('<p>Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité.</p>', 7) . '</article></body></html>',
'fr',
// tidy would add <head> tag.
false,
],
'fragment' => [
'<article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article>',
'<article>' . str_repeat('<p>Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité.</p>', 7) . '</article>',
'',
// tidy would add <html>.
false,
Expand All @@ -526,6 +540,8 @@ public function testHtmlLang($html, $lang, $useTidy = true)
$this->assertTrue($res);
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
$this->assertSame($lang, $readability->dom->documentElement->getAttribute('lang'));
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
$this->assertStringContainsString('êtres', $readability->getContent()->getInnerHtml());
}

private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)
Expand Down
Loading