JEMBOT MAWOT Bypass Shell

Current Path : /home/cinepatreb/billetterie/vendor/soundasleep/html2text/src/
Current File : /home/cinepatreb/billetterie/vendor/soundasleep/html2text/src/Html2Text.php
<?php
/******************************************************************************
 * Copyright (c) 2010 Jevon Wright and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * or
 *
 * LGPL which is available at http://www.gnu.org/licenses/lgpl.html
 *
 *
 * Contributors:
 *    Jevon Wright - initial API and implementation
 ****************************************************************************/

namespace Html2Text;

class Html2Text {

	/**
	 * Tries to convert the given HTML into a plain text format - best suited for
	 * e-mail display, etc.
	 *
	 * <p>In particular, it tries to maintain the following features:
	 * <ul>
	 *   <li>Links are maintained, with the 'href' copied over
	 *   <li>Information in the &lt;head&gt; is lost
	 * </ul>
	 *
	 * @param string $html the input HTML
	 * @param boolean $ignore_error Ignore xml parsing errors
	 * @return string the HTML converted, as best as possible, to text
	 * @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
	 */
	public static function convert($html, $ignore_error = false) {
		// replace non-breaking spaces (the entity and the raw UTF-8 bytes of
		// U+00A0) with plain spaces
		$html = str_replace(array("&nbsp;", "\xc2\xa0"), " ", $html);

		$is_office_document = static::isOfficeDocument($html);

		if ($is_office_document) {
			// remove office namespace
			$html = str_replace(array("<o:p>", "</o:p>"), "", $html);
		}

		$html = static::fixNewlines($html);
		if (mb_detect_encoding($html, "UTF-8", true)) {
			// Encode every non-ASCII character as a numeric entity so that the
			// markup handed to the parser is pure ASCII. This replaces the
			// "HTML-ENTITIES" pseudo-encoding of mb_convert_encoding(), which is
			// deprecated as of PHP 8.2; both forms decode to the same characters.
			$html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), "UTF-8");
		}

		$doc = static::getDocument($html, $ignore_error);

		$output = static::iterateOverNode($doc, null, false, $is_office_document);

		// remove leading and trailing spaces on each line
		$output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);
		$output = preg_replace("/ *\t */im", "\t", $output);

		// unarmor pre blocks: newlines inside <pre> were temporarily stored as
		// \r so that the whitespace clean-up above would not touch them
		$output = str_replace("\r", "\n", $output);

		// remove unnecessary empty lines
		$output = preg_replace("/\n\n\n*/im", "\n\n", $output);

		// remove leading and trailing whitespace
		return trim($output);
	}

	/**
	 * Unify newlines; in particular, \r\n becomes \n, and
	 * then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
	 * all become \ns.
	 *
	 * @param string $text text with any number of \r, \r\n and \n combinations
	 * @return string the fixed text
	 */
	public static function fixNewlines($text) {
		// replace \r\n with \n first, so the pair does not turn into two newlines
		$text = str_replace("\r\n", "\n", $text);
		// any \r that is left is a lone one: turn it into \n as well
		$text = str_replace("\r", "\n", $text);

		return $text;
	}

	/**
	 * Parse HTML into a DOMDocument
	 *
	 * @param string $html the input HTML
	 * @param boolean $ignore_error Ignore xml parsing errors
	 * @return \DOMDocument the parsed document tree (an empty document when
	 *                      $html is empty or contains only whitespace)
	 * @throws Html2TextException if the HTML could not be loaded
	 */
	public static function getDocument($html, $ignore_error = false) {

		$doc = new \DOMDocument();

		$html = trim($html);

		// Compare against the empty string explicitly: a plain truthiness test
		// would also reject the perfectly valid input "0".
		if ($html === '') {
			// DOMDocument doesn't support empty value and throws an error
			// Return empty document instead
			return $doc;
		}

		if ($html[0] !== '<') {
			// If HTML does not begin with a tag, we put a body tag around it.
			// If we do not do this, PHP will insert a paragraph tag around
			// the first block of text for some reason which can mess up
			// the newlines. See pre.html test for an example.
			$html = '<body>' . $html . '</body>';
		}

		if ($ignore_error) {
			$doc->strictErrorChecking = false;
			$doc->recover = true;
			$doc->xmlStandalone = true;
			// Keep libxml from emitting PHP warnings while parsing, then
			// restore whatever error mode the caller had configured.
			$old_internal_errors = libxml_use_internal_errors(true);
			$load_result = $doc->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET);
			libxml_use_internal_errors($old_internal_errors);
		}
		else {
			$load_result = $doc->loadHTML($html);
		}

		if (!$load_result) {
			throw new Html2TextException("Could not load HTML - badly formed?", $html);
		}

		return $doc;
	}

	/**
	 * Can we guess that this HTML is generated by Microsoft Office?
	 *
	 * @param string $html the input HTML
	 * @return boolean true if the Microsoft Office XML namespace URN occurs anywhere in $html
	 */
	public static function isOfficeDocument($html) {
		return strpos($html, "urn:schemas-microsoft-com:office") !== false;
	}

	/**
	 * Is the given text empty or made up only of newlines, carriage returns,
	 * tabs and spaces?
	 *
	 * @param string $text the text to inspect
	 * @return boolean true if nothing but whitespace (or nothing at all) is present
	 */
	public static function isWhitespace($text) {
		return trim($text, "\n\r\t ") === '';
	}

	/**
	 * Find the lower-cased node name of the next meaningful sibling of $node:
	 * the first following sibling that is either an element or a text node
	 * with non-whitespace content. Other siblings are skipped.
	 *
	 * @param \DOMNode $node the node whose following siblings are inspected
	 * @return string|null the sibling's node name, or null if there is none
	 */
	public static function nextChildName($node) {
		for ($next = $node->nextSibling; $next !== null; $next = $next->nextSibling) {
			if ($next instanceof \DOMElement) {
				return strtolower($next->nodeName);
			}
			if ($next instanceof \DOMText && !static::isWhitespace($next->wholeText)) {
				return strtolower($next->nodeName);
			}
		}

		return null;
	}

	/**
	 * Recursively render a DOM node and its descendants as plain text.
	 *
	 * Note that this walk is destructive: every child is removed from $node
	 * once it has been rendered, so the tree is empty afterwards.
	 *
	 * Newlines produced inside &lt;pre&gt; blocks are returned as \r ("armored")
	 * and have to be converted back to \n by the caller, as convert() does.
	 *
	 * @param \DOMNode $node the node to render
	 * @param string|null $prevName lower-cased name of the previous visible sibling, if any
	 * @param boolean $in_pre whether $node is located inside a &lt;pre&gt; element
	 * @param boolean $is_office_document whether the HTML looks like Microsoft Office output
	 * @return string the text for this node
	 */
	public static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false) {

		if ($node instanceof \DOMText) {
			if ($in_pre) {
				$text = "\n" . trim($node->wholeText, "\n\r\t ") . "\n";
				// Remove trailing whitespace only
				$text = preg_replace("/[ \t]*\n/im", "\n", $text);
				// armor newlines with \r.
				return str_replace("\n", "\r", $text);
			}

			// Replace runs of whitespace characters with a single space
			$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
			// text that directly follows a block element starts on its own line
			if (!static::isWhitespace($text) && ($prevName === 'p' || $prevName === 'div')) {
				return "\n" . $text;
			}
			return $text;
		}
		if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) {
			// ignore
			return "";
		}

		$name = strtolower($node->nodeName);
		$nextName = static::nextChildName($node);

		// start whitespace
		switch ($name) {
			case "hr":
				$prefix = '';
				if ($prevName !== null) {
					$prefix = "\n";
				}
				return $prefix . "---------------------------------------------------------------\n";

			case "style":
			case "head":
			case "title":
			case "meta":
			case "script":
				// ignore these tags
				return "";

			case "h1":
			case "h2":
			case "h3":
			case "h4":
			case "h5":
			case "h6":
			case "ol":
			case "ul":
				// add one newline before the content; headings get a second
				// one after their content in the end-whitespace step below
				$output = "\n";
				break;

			case "td":
			case "th":
				// add tab char to separate table fields
				$output = "\t";
				break;

			case "p":
				// Microsoft exchange emails often include HTML which, when passed through
				// html2text, results in lots of double line returns everywhere.
				//
				// To fix this, for any p element with a className of `MsoNormal` (the standard
				// classname in any Microsoft export or outlook for a paragraph that behaves
				// like a line return) we skip the first line returns and set the name to br.
				if ($is_office_document && $node->getAttribute('class') === 'MsoNormal') {
					$output = "";
					$name = 'br';
					break;
				}
				// add two lines
				$output = "\n\n";
				break;

			case "pre":
			case "tr":
			case "div":
				// add one line
				$output = "\n";
				break;

			case "li":
				$output = "- ";
				break;

			default:
				// print out contents of unknown tags
				$output = "";
				break;
		}

		// debug
		//$output .= "[$name,$nextName]";

		if (isset($node->childNodes)) {

			$n = $node->childNodes->item(0);
			$previousSiblingName = null;

			while ($n !== null) {

				$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name === 'pre', $is_office_document);

				// Pass current node name to next child, as previousSibling does not appear to get populated
				$is_invisible = $n instanceof \DOMDocumentType
					|| $n instanceof \DOMProcessingInstruction
					|| ($n instanceof \DOMText && static::isWhitespace($text));
				if (!$is_invisible) {
					// invisible nodes keep the current previousSiblingName
					$previousSiblingName = strtolower($n->nodeName);
				}

				// consume the child, so that the next one moves up to index 0
				$node->removeChild($n);
				$n = $node->childNodes->item(0);

				// suppress last br tag inside a node list
				if ($n !== null || $previousSiblingName !== 'br') {
					$output .= $text;
				}
			}
		}

		// end whitespace
		switch ($name) {
			case "h1":
			case "h2":
			case "h3":
			case "h4":
			case "h5":
			case "h6":
				$output .= "\n";
				break;

			case "p":
				// add two lines
				$output .= "\n\n";
				break;

			case "pre":
			case "br":
				// add one line
				$output .= "\n";
				break;

			case "div":
				break;

			case "a":
				// links are returned in [text](link) format
				$href = $node->getAttribute("href");
				$title = $node->getAttribute("title");

				$output = trim($output);

				// remove double [[ ]] s from linking images
				if (substr($output, 0, 1) === "[" && substr($output, -1) === "]") {
					$output = (string) substr($output, 1, strlen($output) - 2);

					// for linking images, the title of the <a> overrides the title of the <img>
					if ($title !== '') {
						$output = $title;
					}
				}

				// if there is no link text, but a title attr
				// (explicit comparison, so that a link text of "0" is kept)
				if ($output === '' && $title !== '') {
					$output = $title;
				}

				if ($href === '') {
					// it doesn't link anywhere
					if ($node->getAttribute("name") !== '') {
						$output = "[$output]";
					}
				} elseif ($href !== $output && $href !== "mailto:$output" && $href !== "http://$output" && $href !== "https://$output") {
					// the text differs from the target: show both, or just the
					// target when there is no text at all. Otherwise the text
					// already is the address and is used unchanged.
					$output = ($output !== '') ? "[$output]($href)" : $href;
				}

				// does the next node require additional whitespace?
				switch ($nextName) {
					case "h1": case "h2": case "h3": case "h4": case "h5": case "h6":
						$output .= "\n";
						break;
				}
				break;

			case "img":
				// images are shown as [title], falling back to [alt]
				$label = $node->getAttribute("title");
				if ($label === '') {
					$label = $node->getAttribute("alt");
				}
				$output = ($label !== '') ? "[" . $label . "]" : "";
				break;

			case "li":
				$output .= "\n";
				break;

			default:
				// do nothing
		}

		return $output;
	}
}
xxxxx1.0, XXX xxxx