Skip to content

Commit

Permalink
Porting improvements from html2text_ruby to html2text
Browse files Browse the repository at this point in the history
- Adding support for image text
- Adding better support for links with titles, blank text
- Remove nonbreaking characters that aren't directly entities
- Clean up spacing around tabs generated from tables
  • Loading branch information
soundasleep committed Dec 18, 2015
1 parent 49d7e0d commit 574b93d
Show file tree
Hide file tree
Showing 9 changed files with 416 additions and 60 deletions.
45 changes: 4 additions & 41 deletions .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -12,45 +12,8 @@ trim_trailing_whitespace = true
indent_style = tab
indent_size = 4

# 2 space indentation
[**.coffee]
indent_style = space
indent_size = 2

[**.js]
indent_style = space
indent_size = 2

[**.json]
indent_style = space
indent_size = 2

[**.css]
indent_style = space
indent_size = 2

[**.scss]
indent_style = space
indent_size = 2

[**.yml]
indent_style = space
indent_size = 2

[**.js]
indent_style = space
indent_size = 2

[**.txt]
indent_style = space
indent_size = 2

[**.md]
indent_style = space
indent_size = 2

[**.sql]
# very important; mysql client will interpret \t as tab character
indent_style = space
indent_size = 2
# Files under spec/examples/: tab-indented, with trailing whitespace and the
# final newline left exactly as written (no trimming, no newline insertion).
[spec/examples/*]
# EditorConfig only recognises `tab` or `space` for indent_style; `tabs` is not
# a valid value and would be ignored.
indent_style = tab
trim_trailing_whitespace = false
insert_final_newline = false

50 changes: 40 additions & 10 deletions src/Html2Text.php
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class Html2Text {
static function convert($html) {
// replace &nbsp; with spaces
$html = str_replace("&nbsp;", " ", $html);
$html = str_replace("\xa0", " ", $html);

$html = static::fixNewlines($html);

Expand All @@ -48,6 +49,10 @@ static function convert($html) {

// remove leading and trailing spaces on each line
$output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);
$output = preg_replace("/ *\t */im", "\t", $output);

// remove unnecessary empty lines
$output = preg_replace("/\n\n\n*/im", "\n\n", $output);

// remove leading and trailing whitespace
$output = trim($output);
Expand Down Expand Up @@ -124,7 +129,7 @@ static function iterateOverNode($node) {
// start whitespace
switch ($name) {
case "hr":
return "------\n";
return "---------------------------------------------------------------\n";

case "style":
case "head":
Expand Down Expand Up @@ -184,14 +189,6 @@ static function iterateOverNode($node) {

// end whitespace
switch ($name) {
case "style":
case "head":
case "title":
case "meta":
case "script":
// ignore these tags
return "";

case "h1":
case "h2":
case "h3":
Expand All @@ -217,6 +214,24 @@ static function iterateOverNode($node) {
case "a":
// links are returned in [text](link) format
$href = $node->getAttribute("href");

$output = trim($output);

// remove double [[ ]] s from linking images
if (substr($output, 0, 1) == "[" && substr($output, -1) == "]") {
$output = substr($output, 1, strlen($output) - 2);

// for linking images, the title of the <a> overrides the title of the <img>
if ($node->getAttribute("title")) {
$output = $node->getAttribute("title");
}
}

// if there is no link text, but a title attr
if (!$output && $node->getAttribute("title")) {
$output = $node->getAttribute("title");
}

if ($href == null) {
// it doesn't link anywhere
if ($node->getAttribute("name") != null) {
Expand All @@ -228,7 +243,12 @@ static function iterateOverNode($node) {
$output;
} else {
// replace it
$output = "[$output]($href)";
if ($output) {
$output = "[$output]($href)";
} else {
// empty string
$output = $href;
}
}
}

Expand All @@ -240,6 +260,16 @@ static function iterateOverNode($node) {
}
break;

case "img":
if ($node->getAttribute("title")) {
$output = "[" . $node->getAttribute("title") . "]";
} elseif ($node->getAttribute("alt")) {
$output = "[" . $node->getAttribute("alt") . "]";
} else {
$output = "";
}
break;

case "li":
$output .= "\n";
break;
Expand Down
8 changes: 8 additions & 0 deletions tests/Html2TextTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,12 @@ function testLists() {
$this->doTest("lists");
}

/**
 * Runs the shared doTest() check for the "full_email" case.
 *
 * NOTE(review): doTest() is defined outside this excerpt; presumably it
 * converts tests/full_email.html and compares the result against an
 * expected text file of the same base name - confirm against doTest().
 */
function testFullEmail() {
$this->doTest("full_email");
}

/**
 * Runs the shared doTest() check for the "images" case.
 *
 * NOTE(review): doTest() and the "images" fixture files are not visible in
 * this excerpt; presumably this covers the <img> title/alt handling in
 * Html2Text - confirm against the fixture.
 */
function testImages() {
$this->doTest("images");
}

}
8 changes: 4 additions & 4 deletions tests/anchors.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
A document without any HTML open/closing tags.
------
We try and use the representation given by common browsers of the HTML document, so that it looks similar when converted to plain text. [visit foo.com](http://foo.com) - or http://www.foo.com [link](http://foo.com)
A document without any HTML open/closing tags.
---------------------------------------------------------------
We try and use the representation given by common browsers of the HTML document, so that it looks similar when converted to plain text. [visit foo.com](http://foo.com) - or http://www.foo.com [link](http://foo.com)

[An anchor which will not appear]
220 changes: 220 additions & 0 deletions tests/full_email.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="viewport" content="width=680">
</head>
<body class="cat-update-email cat-update" style="background: #ffccee; color: blue; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; text-align: center" bgcolor="#ffccee">
<style type="text/css">
body.cat-update-email {
margin: 0; padding: 0; background: #ffccee; color: blue; text-align: center;
}
body.cat-update-email {
font-size: 12px; font-family: Times New Roman; font-weight: normal;
}
body.cat-update-email th {
font-size: 12px; font-family: Times New Roman; font-weight: normal;
}
body.cat-update-email td {
font-size: 12px; font-family: Times New Roman; font-weight: normal;
}
</style>
<table class="header-wrapper" style="border-spacing: 0; border: none; margin: 0; width: 100%">
<tr>
<td class="header" style="background: none; color: #999; font-family: Times New Roman; font-size: 12px; font-weight: normal; padding: 15px 0">
<table cellspacing="0" cellpadding="0" border="0" style="margin: 0 auto; padding: 0 20px; width: 640px">
<tr>
<th style="font-family: Times New Roman; font-size: 12px; font-weight: normal">
<a class="logo" href="http://localhost/home" style="color: red; text-decoration: none">
<img border="0" height="32" src="test.png" width="200" style="display: block">
</a> </th>
<td class="account-number" style="color: white; font-family: Times New Roman; font-size: 12px; font-weight: normal; text-align: right" align="right">
16 December 2015<br>
Account 123
</td>
</tr>
</table>
</td>
</tr>
</table>

<table class="section-wrapper" style="border-spacing: 0; border: none; margin: 0 auto 20px; width: 640px">
<tr>
<td class="salutation section" style="background: white; color: black; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0 auto 20px; padding: 40px 20px; text-align: left; width: 600px" align="left" bgcolor="white">
<h1 class="user_greeting" style="font-family: Times New Roman; font-size: 1.8; font-weight: normal; line-height: 1.2; margin: 0 0 1em">
Hi Susan
</h1>
<p class="message" style="font-size: 1.5em; line-height: 1.2; margin: 0">
Here is your cat report.
</p>

</td>
</tr>
</table>




<table class="section-wrapper" style="border-spacing: 0; border: none; margin: 0 auto 20px; width: 640px">
<tr>
<td class="balance section" style="background: white; color: black; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0 auto 20px; padding: 40px 20px; text-align: left; width: 600px" align="left" bgcolor="white">
<div class="account-status-heading" style="font-size: 2.5em; line-height: 1em; padding: 30px 20px; text-align: center" align="center">You have found <span class="status-cats-negative" style="color: #df0000">5 cats</span> less than anyone else</div>

<div id="cat-update-action-buttons">
<div id="buy-button" style="text-align: center" align="center">
<a class="btn-alert" href="http://localhost/cats" id="buy-cats-button" style="-moz-appearance: none; -webkit-appearance: none; background: #DF0000; border-radius: 3px; border: 11px solid #df0000; color: #fff; cursor: pointer; display: block; font-size: 16px; height: 16px; line-height: 16px; margin: 0 auto; text-decoration: none; transition: background-color .15s; width: 120px">Find more cats</a>
</div>
</div>
</td>
</tr>
</table>

<table class="section-wrapper" style="border-spacing: 0; border: none; margin: 0 auto 20px; width: 640px">
<tr>
<td class="cats section" id="cats" style="background: white; color: black; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0 auto 20px; padding: 40px 20px; text-align: left; width: 600px" align="left" bgcolor="white">
<div class="cats-usage">
<h2 style="font-family: Times New Roman; font-size: 1.8; font-weight: normal; line-height: 1.2; margin: 0">Down the road</h2>
<p class="fine-print" style="margin: 0">Across the hall</p>

<h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 10px 0 0">Your achievements</h3>
<table class="current-usage with-icon-left" style="border-collapse: collapse; border-spacing: 0; margin-bottom: 20px; margin-top: 20px; width: 100%">
<tr>
<th style="border: none; font-family: Times New Roman; font-size: 14px; font-weight: bold; margin: 0; padding: 0; text-align: left; vertical-align: middle; width: 50px" align="left" valign="middle"><img src="test.png"></th>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; vertical-align: top; width: 550px" valign="top">
<div class="top">You're currently finding about</div>
<div class="large" style="color: black; font-size: 18px; padding: 4px 0">12 cats</div>
<div class="bottom">per day</div>
</td>
</tr>
<tr><td colspan="2" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; vertical-align: top; width: 550px" valign="top"> </td></tr>
<tr>
<td colspan="2" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; vertical-align: top; width: 550px" valign="top"><img alt="Number of cats found" src="test.png"></td>
</tr>
</table>
</div>


<div class="summary">
<hr class="fine-print" style="border-bottom-color: #eee; border-bottom-style: solid; border-width: 0 0 1px; margin: 20px 0">

<h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 10px 0 0">Your last cat was found two days ago.</h3>
<p class="fine-print" style="margin: 0">One type of cat is a kitten.</p>

<table class="readings" style="border-collapse: collapse; border-spacing: 0; margin: 10px 0; width: 100%">
<tr style="color: #BD236C">
<td class="left-column" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; width: 5%">
<img src="test.png" style="padding-top: 10px">
</td>
<td class="center-column" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; width: 60%">
<h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 10px 0 0">Special account <span class="nickname" style="font-size: 12px"></span> <span class="fine-print">A1</span>
</h3>
</td>
<td class="right-column" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0; width: 20%">
<h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 10px 0 0">12.345</h3>
</td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 0"></td>
</tr>
</table>

</div>

</td>
</tr>
</table>

<div class="banner" style="margin: 0 auto 20px; padding: 10px; text-align: center; width: 640px" align="center">
<a href="http://localhost/logout" style="color: red; text-decoration: none">
<img alt="" border="0" height="177" src="http://localhost/photo1.png" width="600">
</a>
</div>

<table class="section-wrapper" style="border-spacing: 0; border: none; margin: 0 auto 20px; width: 640px">
<tr>
<td class="tips section" style="background: white; color: black; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0 auto 20px; padding: 40px 20px; text-align: left; width: 600px" align="left" bgcolor="white">
<table style="border-collapse: collapse; border-spacing: 0; width: 100%">
<tr>
<td colspan="3" style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top"><h2 style="font-family: Times New Roman; font-size: 1.8; font-weight: normal; line-height: 1.2; margin: 0 0 10px">How can you find more cats?</h2></td>
</tr>

<tr class="icon">
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top"><img height="40" src="http://localhost/photo1.png" width="40"></td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top"><img height="40" src="http://localhost/photo2.png" width="40"></td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top"><img height="40" src="http://localhost/photo3.png" width="40"></td>
</tr>

<tr class="subtitle">
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top"><h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 0 0 5px">Look in trash cans</h3></td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top"><h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 0 0 5px">Start meowing</h3></td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top"><h3 style="font-family: Times New Roman; font-size: 18px; font-weight: normal; line-height: 2em; margin: 0 0 5px">Eat cat food</h3></td>
</tr>

<tr class="body" style="color: green">
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top">Some cats like to hang out in trash cans. Some cats do not.</td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">Some cats are attracted to similar tones.</td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">So one day your tears may smell like cat food, attracting more cats.</td>
</tr>

<tr class="image">
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top">
<a href="https://localhost/about" style="color: red; text-decoration: none">
<img border="0" height="130" src="http://localhost/photo1.png" style="display: block; margin: 10px 0" width="165">
</a>
</td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">
<a href="https://localhost/about" style="color: red; text-decoration: none">
<img border="0" height="130" src="http://localhost/photo2.png" style="display: block; margin: 10px 0" width="165">
</a>
</td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">
<a href="https://localhost/about" style="color: red; text-decoration: none">
<img border="0" height="130" src="http://localhost/photo3.png" style="display: block; margin: 10px 0" width="165">
</a>
</td>
</tr>

<tr class="tips-footer" style="color: green">
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0; vertical-align: top; width: 200px" valign="top">
<a href="https://github.com/soundasleep/html2text_ruby" style="color: red; text-decoration: none">Cats are great.</a>
</td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">
<a href="https://github.com/soundasleep/html2text_ruby" style="color: red; text-decoration: none">Find more cats.</a>
</td>
<td style="border: none; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0; padding: 5px 0 0 17px; vertical-align: top; width: 200px" valign="top">
<a href="https://github.com/soundasleep/html2text_ruby" style="color: red; text-decoration: none">Do more things.</a>
</td>
</tr>
</table>
</td>
</tr>
</table>





<table class="footer-wrapper" style="margin: 0 auto 20px">
<tr>
<td class="footer" style="color: #9B9B9B; font-family: Times New Roman; font-size: 12px; font-weight: normal; margin: 0 auto 4em; text-align: left; width: 600px" align="left">
<h3 style="font-family: Times New Roman; font-size: 1.2; font-weight: normal; line-height: 2em; margin: 0">
<a href="http://localhost/contact" style="color: red; text-decoration: none">Contact us</a>
</h3>
<p style="margin: 0 0 1em">
[email protected]<br>
Monday and Friday
</p>

<p style="margin: 0 0 1em"><a href="https://github.com/soundasleep/html2text" style="color: red; text-decoration: none"><img align="absmiddle" height="26" src="test.png" width="26"></a>
<a href="https://github.com/soundasleep/html2text_ruby" style="color: red; text-decoration: none"><img align="absmiddle" height="26" src="test.png" width="26"></a>
</p>

<p class="message no-web-display" style="margin: 0">Having trouble seeing this email?
<a href="http://localhost/view_it_online" style="color: red; text-decoration: none">View it online</a>.
</p>

</td>
</tr>
</table>
<script async type="text/javascript" id="profiler" src="/profiler.js" data-version="1.0"></script>
</body>
</html>

Loading

0 comments on commit 574b93d

Please sign in to comment.