Class Markdown_Parser

Markdown_Parser

Properties

public $block_gamut

Default value:
array(5) (
    "doHeaders" => integer 10
    "doHorizontalRules" => integer 20
    "doLists" => integer 40
    "doCodeBlocks" => integer 50
    "doBlockQuotes" => integer 60
)

public $document_gamut

Default value:
array(2) (
    "stripLinkDefinitions" => integer 20
    "runBasicBlockGamut" => integer 30
)

public $em_relist

Default value:
array(3) (
    "" => string(50) "(?:(?<!\*)\*(?!\*)|(?<!_)_(?!_))(?=\S)(?![.,:;]\s)"
    "*" => string(22) "(?<=\S)(?<!\*)\*(?!\*)"
    "_" => string(19) "(?<=\S)(?<!_)_(?!_)"
)

public $em_strong_prepared_relist

Default value:
NULL

public $em_strong_relist

Default value:
array(3) (
    "" => string(56) "(?:(?<!\*)\*\*\*(?!\*)|(?<!_)___(?!_))(?=\S)(?![.,:;]\s)"
    "***" => string(26) "(?<=\S)(?<!\*)\*\*\*(?!\*)"
    "___" => string(21) "(?<=\S)(?<!_)___(?!_)"
)

public $empty_element_suffix

Default value:
string(3) " />"

public $escape_chars

Default value:
string(16) "\`*_{}[]()>#+-.!"

public $escape_chars_re

Default value:
NULL

public $html_hashes

Default value:
array(0) 

public $in_anchor

Default value:
bool FALSE

public $list_level

Default value:
integer 0

public $nested_brackets_depth

Default value:
integer 6

public $nested_brackets_re

Default value:
NULL

public $nested_url_parenthesis_depth

Default value:
integer 4

public $nested_url_parenthesis_re

Default value:
NULL

public $no_entities

Default value:
bool FALSE

public $no_markup

Default value:
bool FALSE

public $predef_titles

Default value:
array(0) 

public $predef_urls

Default value:
array(0) 

public $span_gamut

Default value:
array(7) (
    "parseSpan" => integer -30
    "doImages" => integer 10
    "doAnchors" => integer 20
    "doAutoLinks" => integer 30
    "encodeAmpsAndAngles" => integer 40
    "doItalicsAndBold" => integer 50
    "doHardBreaks" => integer 60
)

public $strong_relist

Default value:
array(3) (
    "" => string(53) "(?:(?<!\*)\*\*(?!\*)|(?<!_)__(?!_))(?=\S)(?![.,:;]\s)"
    "**" => string(24) "(?<=\S)(?<!\*)\*\*(?!\*)"
    "__" => string(20) "(?<=\S)(?<!_)__(?!_)"
)

public $tab_width

Default value:
integer 4

public $titles

Default value:
array(0) 

public $urls

Default value:
array(0) 

public $utf8_strlen

Default value:
string(9) "mb_strlen"

Methods

public __construct() (defined in Markdown_Parser)

Source Code

function __construct() {
#
# Constructor. Prepares the detab helper and the emphasis regex table,
# builds the derived regular expressions and sorts the three gamuts.
#
	$this->_initDetab();
	$this->prepareItalicsAndBold();

	# Balanced square brackets: one opening fragment per nesting level,
	# followed by the same number of closing fragments.
	$bracket_levels = $this->nested_brackets_depth;
	$bracket_open   = str_repeat('(?>[^\[\]]+|\[', $bracket_levels);
	$bracket_close  = str_repeat('\])*', $bracket_levels);
	$this->nested_brackets_re = $bracket_open . $bracket_close;

	# Balanced parentheses inside a URL, built the same way.
	$paren_levels = $this->nested_url_parenthesis_depth;
	$paren_open   = str_repeat('(?>[^()\s]+|\(', $paren_levels);
	$paren_close  = str_repeat('(?>\)))*', $paren_levels);
	$this->nested_url_parenthesis_re = $paren_open . $paren_close;

	# Character class matching any backslash-escapable character.
	$this->escape_chars_re = '[' . preg_quote($this->escape_chars) . ']';

	# Order every gamut by ascending priority value, keeping method names
	# as keys.
	asort($this->span_gamut);
	asort($this->block_gamut);
	asort($this->document_gamut);
}

public _detab_callback() (defined in Markdown_Parser)

Source Code

function _detab_callback($matches) {
#
# Callback for detab(): rebuild one matched line with every tab replaced
# by the number of spaces needed to reach the next tab stop.
#
	$measure = $this->utf8_strlen; # strlen function for UTF-8.
	$width   = $this->tab_width;

	# Pieces of the line that were separated by tab characters.
	$segments = explode("\t", $matches[0]);

	# Start from the first piece, then append the others one by one,
	# padding up to the next multiple of the tab width each time.
	$expanded = array_shift($segments);
	foreach ($segments as $segment) {
		$padding = $width - $measure($expanded, 'UTF-8') % $width;
		$expanded .= str_repeat(" ", $padding) . $segment;
	}
	return $expanded;
}

public _doAnchors_inline_callback() (defined in Markdown_Parser)

Source Code

function _doAnchors_inline_callback($matches) {
#
# Callback for the inline-link pattern of doAnchors(): builds the <a> tag
# for [link text](url "optional title") and returns it as a hash token.
#
	$label =  $this->runSpanGamut($matches[2]);
	# The URL is in $3 when written as <url>, otherwise in $4.
	$href  =  $matches[3] == '' ? $matches[4] : $matches[3];
	# Bound by reference so a missing title group stays unset.
	$title =& $matches[7];

	$tag = '<a href="' . $this->encodeAttribute($href) . '"';
	if (isset($title)) {
		$tag .= ' title="' . $this->encodeAttribute($title) . '"';
	}

	# The link text is run through the span gamut once more before it is
	# placed inside the tag.
	$label = $this->runSpanGamut($label);

	return $this->hashPart($tag . ">$label</a>");
}

public _doAnchors_reference_callback() (defined in Markdown_Parser)

Source Code

function _doAnchors_reference_callback($matches) {
#
# Callback for the reference-link pattern of doAnchors(). Looks the link
# id up in $this->urls; returns the hashed <a> tag when the id is known
# and the untouched source text ($1) otherwise.
#
	$label =  $matches[2];
	# Bound by reference so a missing id group is treated as empty.
	$id    =& $matches[3];

	if ($id == "") {
		# for shortcut links like [this][] or [this].
		$id = $label;
	}

	# lower-case and turn embedded newlines into spaces
	$id = preg_replace('{[ ]?\n}', ' ', strtolower($id));

	if (!isset($this->urls[$id])) {
		# Unknown id: leave the text as it was written.
		return $matches[1];
	}

	$tag = '<a href="' . $this->encodeAttribute($this->urls[$id]) . '"';
	if (isset($this->titles[$id])) {
		$tag .= ' title="' . $this->encodeAttribute($this->titles[$id]) . '"';
	}

	$label = $this->runSpanGamut($label);
	return $this->hashPart($tag . ">$label</a>");
}

public _doAutoLinks_email_callback() (defined in Markdown_Parser)

Source Code

function _doAutoLinks_email_callback($matches) {
#
# Callback for the e-mail pattern of doAutoLinks(): turns the captured
# address ($1) into an obfuscated mailto link and hashes it.
#
	return $this->hashPart($this->encodeEmailAddress($matches[1]));
}

public _doAutoLinks_url_callback() (defined in Markdown_Parser)

Source Code

function _doAutoLinks_url_callback($matches) {
#
# Callback for the URL pattern of doAutoLinks(): the captured URL ($1) is
# attribute-encoded and used both as href and as link text.
#
	$href = $this->encodeAttribute($matches[1]);
	return $this->hashPart('<a href="' . $href . '">' . $href . '</a>');
}

public _doBlockQuotes_callback() (defined in Markdown_Parser)

Source Code

function _doBlockQuotes_callback($matches) {
#
# Callback for doBlockQuotes(): converts one matched run of quoted lines
# ($1) into a hashed <blockquote> block.
#
	# Trim one level of quoting and empty out whitespace-only lines, then
	# process the remaining text as block-level Markdown (recursion).
	$inner = preg_replace('/^[ ]*>[ ]?|^[ ]+$/m', '', $matches[1]);
	$inner = $this->runBlockGamut($inner);

	# Indent every line by two spaces...
	$inner = preg_replace('/^/m', "  ", $inner);
	# ...and remove that indentation again from <pre> content, where the
	# leading spaces cause problems.
	$inner = preg_replace_callback('{(\s*<pre>.+?</pre>)}sx',
		[$this, '_DoBlockQuotes_callback2'], $inner);

	$html = "<blockquote>\n$inner\n</blockquote>";
	return "\n" . $this->hashBlock($html) . "\n\n";
}

public _doBlockQuotes_callback2() (defined in Markdown_Parser)

Source Code

function _doBlockQuotes_callback2($matches) {
#
# Callback used by _doBlockQuotes_callback(): removes the two-space
# indentation from the start of every line of the matched <pre> block.
#
	return preg_replace('/^  /m', '', $matches[1]);
}

public _doCodeBlocks_callback() (defined in Markdown_Parser)

Source Code

function _doCodeBlocks_callback($matches) {
#
# Callback for doCodeBlocks(): turns the matched indented block ($1) into
# a hashed <pre><code> element.
#
	# Remove one indentation level and escape HTML special characters
	# (quotes are left alone).
	$code = htmlspecialchars($this->outdent($matches[1]), ENT_NOQUOTES);

	# trim leading newlines and trailing newlines
	$code = preg_replace('/\A\n+|\n+\z/', '', $code);

	$html = "<pre><code>" . $code . "\n</code></pre>";
	return "\n\n" . $this->hashBlock($html) . "\n\n";
}

public _doHardBreaks_callback() (defined in Markdown_Parser)

Source Code

function _doHardBreaks_callback($matches) {
#
# Callback for doHardBreaks(): the replacement is always a hashed <br>
# element followed by a newline; the match itself is not used.
#
	$break = '<br' . $this->empty_element_suffix . "\n";
	return $this->hashPart($break);
}

public _doHeaders_callback_atx() (defined in Markdown_Parser)

Source Code

function _doHeaders_callback_atx($matches) {
#
# Callback for atx-style headers: the header level is the number of
# leading "#" characters ($1), the header text is $2.
#
	$depth   = strlen($matches[1]);
	$content = $this->runSpanGamut($matches[2]);
	$html    = "<h$depth>" . $content . "</h$depth>";
	return "\n" . $this->hashBlock($html) . "\n\n";
}

public _doHeaders_callback_setext() (defined in Markdown_Parser)

Source Code

function _doHeaders_callback_setext($matches) {
#
# Callback for setext-style headers (text underlined with "=" or "-").
#
# $matches[1] is the header text and $matches[2] the underline. Returns
# the hashed <h1>/<h2> block, or the unmodified match when the "header"
# is really an empty list item followed by a single dash.
#
	# Terrible hack to check we haven't found an empty list item.
	if ($matches[2] == '-' && preg_match('{^-(?: |$)}', $matches[1]))
		return $matches[0];
	
	# "=" underline means level 1, "-" means level 2. Square-bracket
	# offset syntax is used because the curly-brace form was removed in
	# PHP 8.
	$level = $matches[2][0] == '=' ? 1 : 2;
	$block = "<h$level>".$this->runSpanGamut($matches[1])."</h$level>";
	return "\n" . $this->hashBlock($block) . "\n\n";
}

public _doImages_inline_callback() (defined in Markdown_Parser)

Source Code

function _doImages_inline_callback($matches) {
#
# Callback for the inline-image pattern of doImages(): builds the <img>
# tag for ![alt text](url "optional title") and returns it hashed.
#
	# The URL is in $3 when written as <url>, otherwise in $4.
	$src   =  $matches[3] == '' ? $matches[4] : $matches[3];
	# Bound by reference so a missing title group stays unset.
	$title =& $matches[7];

	$alt = $this->encodeAttribute($matches[2]);
	$src = $this->encodeAttribute($src);

	$tag = '<img src="' . $src . '" alt="' . $alt . '"';
	if (isset($title)) {
		$tag .= ' title="' . $this->encodeAttribute($title) . '"';
	}

	return $this->hashPart($tag . $this->empty_element_suffix);
}

public _doImages_reference_callback() (defined in Markdown_Parser)

Source Code

function _doImages_reference_callback($matches) {
#
# Callback for the reference-image pattern of doImages(). Returns the
# hashed <img> tag when the id is found in $this->urls, and the original
# text ($1) when it is not.
#
	$alt = $matches[2];
	$id  = strtolower($matches[3]);
	if ($id == "") {
		# for shortcut links like ![this][].
		$id = strtolower($alt);
	}

	$alt = $this->encodeAttribute($alt);

	if (!isset($this->urls[$id])) {
		# If there's no such link ID, leave intact:
		return $matches[1];
	}

	$src = $this->encodeAttribute($this->urls[$id]);
	$tag = '<img src="' . $src . '" alt="' . $alt . '"';
	if (isset($this->titles[$id])) {
		$tag .= ' title="' . $this->encodeAttribute($this->titles[$id]) . '"';
	}

	return $this->hashPart($tag . $this->empty_element_suffix);
}

public _doLists_callback() (defined in Markdown_Parser)

Source Code

function _doLists_callback($matches) {
#
# Callback for doLists(): converts one whole matched list ($1) into a
# hashed <ul> or <ol> block. The first item marker ($3) decides the type.
#
	# Patterns for bullet and number markers.
	$bullet_re = '[*+-]';
	$number_re = '\d+[.]';

	if (preg_match("/$bullet_re/", $matches[3])) {
		$tag       = "ul";
		$marker_re = $bullet_re;
	} else {
		$tag       = "ol";
		$marker_re = $number_re;
	}

	# Items are processed with a trailing newline appended to the list.
	$items = $this->processListItems($matches[1] . "\n", $marker_re);

	$block = $this->hashBlock("<$tag>\n" . $items . "</$tag>");
	return "\n" . $block . "\n\n";
}

public _hashHTMLBlocks_callback() (defined in Markdown_Parser)

Source Code

function _hashHTMLBlocks_callback($matches) {
#
# Callback for hashHTMLBlocks(): replaces the matched HTML block ($1) by
# its block hash key, surrounded by blank lines.
#
	return "\n\n" . $this->hashBlock($matches[1]) . "\n\n";
}

public _initDetab() (defined in Markdown_Parser)

Source Code

function _initDetab() {
#
# Make sure the `utf8_strlen` property holds something callable. When the
# function it names (initially `mb_strlen`) does not exist, replace it by
# a closure that loosely counts UTF-8 characters with a regular
# expression.
#
	if (!function_exists($this->utf8_strlen)) {
		$this->utf8_strlen = function($text) {
			# preg_match_all returns the number of matches, i.e. the
			# number of characters found; the matches are not needed.
			return preg_match_all('/[\x00-\xBF]|[\xC0-\xFF][\x80-\xBF]*/', $text, $unused);
		};
	}
}

public _processListItems_callback() (defined in Markdown_Parser)

Source Code

function _processListItems_callback($matches) {
#
# Callback for processListItems(): converts one matched list item into an
# <li> element. Items that are preceded or followed by a blank line, or
# that contain one, get block-level processing; the others get span-level
# processing (after handling nested lists).
#
	$body         =  $matches[4];
	$marker_space =  $matches[3];
	# Bound by reference so missing groups read as NULL.
	$blank_before =& $matches[1];
	$indent       =& $matches[2];
	$blank_after  =& $matches[5];

	$is_block = $blank_before || $blank_after || preg_match('/\n{2,}/', $body);

	if (!$is_block) {
		# Recursion for sub-lists:
		$body = $this->doLists($this->outdent($body));
		$body = preg_replace('/\n+$/', '', $body);
		$body = $this->runSpanGamut($body);
	}
	else {
		# Replace marker with the appropriate whitespace indentation
		$body = $indent . str_repeat(' ', strlen($marker_space)) . $body;
		$body = $this->runBlockGamut($this->outdent($body)."\n");
	}

	return "<li>" . $body . "</li>\n";
}

public _stripLinkDefinitions_callback() (defined in Markdown_Parser)

Source Code

function _stripLinkDefinitions_callback($matches) {
#
# Callback for stripLinkDefinitions(): records the URL ($2) and title ($3)
# of a link definition under its lower-cased id ($1), and removes the
# definition from the text.
#
	$id = strtolower($matches[1]);
	$this->urls[$id] = $matches[2];
	# Bound by reference so a missing title group is stored as NULL.
	$this->titles[$id] =& $matches[3];

	# String that will replace the block
	return '';
}

public _unhash_callback() (defined in Markdown_Parser)

Source Code

function _unhash_callback($matches) {
#
# Callback for unhash(): looks up the text stored in `html_hashes` under
# the matched hash key.
#
	$key = $matches[0];
	return $this->html_hashes[$key];
}

public detab() (defined in Markdown_Parser)

Source Code

function detab($text) {
#
# Replace tabs with the appropriate amount of space.
#
	# Only lines that contain a tab are matched. Each one is handed to
	# _detab_callback(), which splits it at the tab characters and joins
	# the pieces again with the right number of spaces in between.
	$tabbed_line_re = '/^.*\t.*$/m';

	return preg_replace_callback($tabbed_line_re,
		[$this, '_detab_callback'], $text);
}

public doAnchors() (defined in Markdown_Parser)

Source Code

function doAnchors($text) {
#
# Turn Markdown link shortcuts into XHTML <a> tags.
#
# Reference-style links are converted first, then inline links. The text
# is returned untouched while `in_anchor` is set, i.e. when this method is
# re-entered while a link is being processed.
#
	if ($this->in_anchor) return $text;
	$this->in_anchor = true;
	
	#
	# First, handle reference-style links: [link text] [id]
	#
	$text = preg_replace_callback('{
		(					# wrap whole match in $1
		  \[
			('.$this->nested_brackets_re.')	# link text = $2
		  \]

		  [ ]?				# one optional space
		  (?:\n[ ]*)?		# one optional newline followed by spaces

		  \[
			(.*?)		# id = $3
		  \]
		)
		}xs',
		[$this, '_doAnchors_reference_callback'], $text);

	#
	# Next, inline-style links: [link text](url "optional title")
	#
	$text = preg_replace_callback('{
		(				# wrap whole match in $1
		  \[
			('.$this->nested_brackets_re.')	# link text = $2
		  \]
		  \(			# literal paren
			[ ]*
			(?:
				<(\S*)>	# href = $3
			|
				('.$this->nested_url_parenthesis_re.')	# href = $4
			)
			[ ]*
			(			# $5
			  ([\'"])	# quote char = $6
			  (.*?)		# Title = $7
			  \6		# matching quote
			  [ ]*	# ignore any spaces/tabs between closing quote and )
			)?			# title is optional
		  \)
		)
		}xs',
		[$this, '_doAnchors_inline_callback'], $text);

	#
	# Last, handle reference-style shortcuts: [link text]
	# These must come last in case you've also got [link text][1]
	# or [link text](/foo)
	#
	# This step is disabled (commented out):
	#
//		$text = preg_replace_callback('{
//			(					# wrap whole match in $1
//			  \[
//				([^\[\]]+)		# link text = $2; can\'t contain [ or ]
//			  \]
//			)
//			}xs',
//			array(&$this, '_doAnchors_reference_callback'), $text);

	$this->in_anchor = false;
	return $text;
}

public doAutoLinks() (defined in Markdown_Parser)

Source Code

function doAutoLinks($text) {
#
# Turn <http://example.com/> style URLs and <address@example.com> style
# e-mail addresses into links.
#
	# URLs: <scheme:...> for the http, https, ftp and dict schemes.
	$url_re = '{<((https?|ftp|dict):[^\'">\s]+)>}i';
	$text = preg_replace_callback($url_re,
		[$this, '_doAutoLinks_url_callback'], $text);

	# Email addresses: <address@example.com>
	$email_re = '{
		<
		(?:mailto:)?
		(
			[-.\w\x80-\xFF]+
			\@
			[-a-z0-9\x80-\xFF]+(\.[-a-z0-9\x80-\xFF]+)*\.[a-z]+
		)
		>
		}xi';
	$text = preg_replace_callback($email_re,
		[$this, '_doAutoLinks_email_callback'], $text);

	return $text;
}

public doBlockQuotes() (defined in Markdown_Parser)

Source Code

function doBlockQuotes($text) {
#
# Find runs of lines quoted with ">" and hand each run to
# _doBlockQuotes_callback() for conversion.
#
	$quote_re = '/
		  (								# Wrap whole match in $1
			(?>
			  ^[ ]*>[ ]?			# ">" at the start of a line
				.+\n					# rest of the first line
			  (.+\n)*					# subsequent consecutive lines
			  \n*						# blanks
			)+
		  )
		/xm';

	return preg_replace_callback($quote_re,
		[$this, '_doBlockQuotes_callback'], $text);
}

public doCodeBlocks() (defined in Markdown_Parser)

Source Code

function doCodeBlocks($text) {
#
#	Process Markdown `<pre><code>` blocks.
#
#	Every run of lines indented by `tab_width` spaces is passed to
#	_doCodeBlocks_callback().
#
	$code_block_re = '{
			(?:\n\n|\A\n?)
			(	            # $1 = the code block -- one or more lines, starting with a space/tab
			  (?>
				[ ]{'.$this->tab_width.'}  # Lines must start with a tab or a tab-width of spaces
				.*\n+
			  )+
			)
			((?=^[ ]{0,'.$this->tab_width.'}\S)|\Z)	# Lookahead for non-space at line-start, or end of doc
		}xm';

	return preg_replace_callback($code_block_re,
		[$this, '_doCodeBlocks_callback'], $text);
}

public doHardBreaks() (defined in Markdown_Parser)

Source Code

function doHardBreaks($text) {
#
# Do hard breaks: two or more spaces right before a newline are replaced
# by whatever _doHardBreaks_callback() returns.
#
	$break_re = '/ {2,}\n/';
	$text = preg_replace_callback($break_re,
		[$this, '_doHardBreaks_callback'], $text);
	return $text;
}

public doHeaders() (defined in Markdown_Parser)

Source Code

function doHeaders($text) {
#
# Convert both header syntaxes. Setext headers are handled first, then
# atx headers.
#
	# Setext-style headers:
	#	  Header 1
	#	  ========
	#  
	#	  Header 2
	#	  --------
	#
	$setext_re = '{ ^(.+?)[ ]*\n(=+|-+)[ ]*\n+ }mx';
	$text = preg_replace_callback($setext_re,
		[$this, '_doHeaders_callback_setext'], $text);

	# atx-style headers:
	#	# Header 1
	#	## Header 2
	#	## Header 2 with closing hashes ##
	#	...
	#	###### Header 6
	#
	$atx_re = '{
			^(\#{1,6})	# $1 = string of #\'s
			[ ]*
			(.+?)		# $2 = Header text
			[ ]*
			\#*			# optional closing #\'s (not counted)
			\n+
		}xm';
	$text = preg_replace_callback($atx_re,
		[$this, '_doHeaders_callback_atx'], $text);

	return $text;
}

public doHorizontalRules() (defined in Markdown_Parser)

Source Code

function doHorizontalRules($text) {
#
# Do Horizontal Rules: a line made of three or more "-", "*" or "_"
# markers (optionally separated by spaces) becomes a hashed <hr> block.
#
	$rule_re = '{
			^[ ]{0,3}	# Leading space
			([-*_])		# $1: First marker
			(?>			# Repeated marker group
				[ ]{0,2}	# Zero, one, or two spaces.
				\1			# Marker character
			){2,}		# Group repeated at least twice
			[ ]*		# Tailing spaces
			$			# End of line.
		}mx';

	# The <hr> element is hashed once; the same key replaces every rule.
	$replacement = "\n" . $this->hashBlock("<hr$this->empty_element_suffix") . "\n";

	return preg_replace($rule_re, $replacement, $text);
}

public doImages() (defined in Markdown_Parser)

Source Code

function doImages($text) {
#
# Turn Markdown image shortcuts into <img> tags.
#
# Reference-style images are converted before inline images.
#
	#
	# First, handle reference-style labeled images: ![alt text][id]
	#
	$reference_re = '{
		(				# wrap whole match in $1
		  !\[
			('.$this->nested_brackets_re.')		# alt text = $2
		  \]

		  [ ]?				# one optional space
		  (?:\n[ ]*)?		# one optional newline followed by spaces

		  \[
			(.*?)		# id = $3
		  \]

		)
		}xs';
	$text = preg_replace_callback($reference_re,
		[$this, '_doImages_reference_callback'], $text);

	#
	# Next, handle inline images:  ![alt text](url "optional title")
	# Don't forget: encode * and _
	#
	$inline_re = '{
		(				# wrap whole match in $1
		  !\[
			('.$this->nested_brackets_re.')		# alt text = $2
		  \]
		  \s?			# One optional whitespace character
		  \(			# literal paren
			[ ]*
			(?:
				<(\S*)>	# src url = $3
			|
				('.$this->nested_url_parenthesis_re.')	# src url = $4
			)
			[ ]*
			(			# $5
			  ([\'"])	# quote char = $6
			  (.*?)		# title = $7
			  \6		# matching quote
			  [ ]*
			)?			# title is optional
		  \)
		)
		}xs';
	$text = preg_replace_callback($inline_re,
		[$this, '_doImages_inline_callback'], $text);

	return $text;
}

public doItalicsAndBold() (defined in Markdown_Parser)

Source Code

function doItalicsAndBold($text) {
#
# Convert emphasis markers ("*", "_", doubled and tripled) found in $text
# into <em> and <strong> elements and return the resulting text.
#
# The text is consumed token by token. $token_stack holds the currently
# open markers and $text_stack the text collected for each of them; index
# 0 is always the innermost open span. $em and $strong hold the marker
# that opened the current emphasis / strong span ('' when none is open)
# and together select the regular expression used to find the next token.
#
	$token_stack = [''];
	$text_stack = [''];
	$em = '';
	$strong = '';
	$tree_char_em = false;
	
	while (1) {
		#
		# Get prepared regular expression for searching emphasis tokens
		# in current context.
		#
		$token_re = $this->em_strong_prepared_relist["$em$strong"];
		
		#
		# Each loop iteration searches for the next emphasis token and
		# splits the text into: text before it, the token, the rest.
		#
		$parts = preg_split($token_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE);
		$text_stack[0] .= $parts[0];
		$token =& $parts[1];
		$text =& $parts[2];
		
		if (empty($token)) {
			# Reached end of text span: empty stack without emitting
			# any more emphasis.
			while ($token_stack[0]) {
				$text_stack[1] .= array_shift($token_stack);
				$text_stack[0] .= array_shift($text_stack);
			}
			break;
		}
		
		$token_len = strlen($token);
		if ($tree_char_em) {
			# Reached closing marker while inside a three-char emphasis.
			if ($token_len == 3) {
				# Three-char closing marker, close em and strong.
				array_shift($token_stack);
				$span = array_shift($text_stack);
				$span = $this->runSpanGamut($span);
				$span = "<strong><em>$span</em></strong>";
				$text_stack[0] .= $this->hashPart($span);
				$em = '';
				$strong = '';
			} else {
				# Other closing marker: close one em or strong and
				# change current token state to match the other.
				# (Square-bracket offsets: the curly-brace form was
				# removed in PHP 8.)
				$token_stack[0] = str_repeat($token[0], 3-$token_len);
				$tag = $token_len == 2 ? "strong" : "em";
				$span = $text_stack[0];
				$span = $this->runSpanGamut($span);
				$span = "<$tag>$span</$tag>";
				$text_stack[0] = $this->hashPart($span);
				$$tag = ''; # $$tag stands for $em or $strong
			}
			$tree_char_em = false;
		} else if ($token_len == 3) {
			if ($em) {
				# Reached closing marker for both em and strong.
				# Close the two innermost spans, one after the other:
				for ($i = 0; $i < 2; ++$i) {
					$shifted_token = array_shift($token_stack);
					$tag = strlen($shifted_token) == 2 ? "strong" : "em";
					$span = array_shift($text_stack);
					$span = $this->runSpanGamut($span);
					$span = "<$tag>$span</$tag>";
					$text_stack[0] .= $this->hashPart($span);
					$$tag = ''; # $$tag stands for $em or $strong
				}
			} else {
				# Reached opening three-char emphasis marker. Push on token 
				# stack; will be handled by the special condition above.
				$em = $token[0];
				$strong = "$em$em";
				array_unshift($token_stack, $token);
				array_unshift($text_stack, '');
				$tree_char_em = true;
			}
		} else if ($token_len == 2) {
			if ($strong) {
				# Unwind any dangling emphasis marker:
				if (strlen($token_stack[0]) == 1) {
					$text_stack[1] .= array_shift($token_stack);
					$text_stack[0] .= array_shift($text_stack);
				}
				# Closing strong marker:
				array_shift($token_stack);
				$span = array_shift($text_stack);
				$span = $this->runSpanGamut($span);
				$span = "<strong>$span</strong>";
				$text_stack[0] .= $this->hashPart($span);
				$strong = '';
			} else {
				# Opening strong marker.
				array_unshift($token_stack, $token);
				array_unshift($text_stack, '');
				$strong = $token;
			}
		} else {
			# Here $token_len == 1
			if ($em) {
				if (strlen($token_stack[0]) == 1) {
					# Closing emphasis marker:
					array_shift($token_stack);
					$span = array_shift($text_stack);
					$span = $this->runSpanGamut($span);
					$span = "<em>$span</em>";
					$text_stack[0] .= $this->hashPart($span);
					$em = '';
				} else {
					# The innermost open span is not an emphasis span:
					# keep the marker as plain text.
					$text_stack[0] .= $token;
				}
			} else {
				# Opening emphasis marker.
				array_unshift($token_stack, $token);
				array_unshift($text_stack, '');
				$em = $token;
			}
		}
	}
	return $text_stack[0];
}

public doLists() (defined in Markdown_Parser)

Source Code

function doLists($text) {
#
# Form HTML ordered (numbered) and unordered (bulleted) lists.
#
# The text is scanned twice, once for bulleted and once for numbered
# lists; every whole list found is passed to _doLists_callback().
#
	$less_than_tab = $this->tab_width - 1;

	# Patterns matching list item bullets and number markers, in the
	# order in which they are tried.
	foreach (['[*+-]', '\d+[.]'] as $marker_re) {
		# Re-usable pattern to match any entire ul or ol list:
		$whole_list_re = '
			(								# $1 = whole list
			  (								# $2
				[ ]{0,'.$less_than_tab.'}
				('.$marker_re.')			# $3 = first list item marker
				[ ]+
			  )
			  (?s:.+?)
			  (								# $4
				  \z
				|
				  \n{2,}
				  (?=\S)
				  (?!						# Negative lookahead for another list item marker
					[ ]*
					'.$marker_re.'[ ]+
				  )
			  )
			)
		'; // mx

		# We use a different prefix before nested lists than top-level lists.
		# See extended comment in _ProcessListItems().
		$prefix_re = $this->list_level
			? '^'
			: '(?:(?<=\n)\n|\A\n?) # Must eat the newline';

		$list_re = '{
					' . $prefix_re . '
					' . $whole_list_re . '
				}mx';

		$text = preg_replace_callback($list_re,
			[$this, '_doLists_callback'], $text);
	}

	return $text;
}

public encodeAmpsAndAngles() (defined in Markdown_Parser)

Source Code

function encodeAmpsAndAngles($text) {
#
# Smart processing for ampersands and angle brackets that need to 
# be encoded. Valid character entities are left alone unless the
# no-entities mode is set.
#
	# In no-entities mode every "&" is encoded. Otherwise only those that
	# do not start something looking like an entity are encoded; this
	# ampersand-encoding is based entirely on Nat Irons's Amputator
	# MT plugin: <http://bumppo.net/projects/amputator/>
	$text = $this->no_entities
		? str_replace('&', '&amp;', $text)
		: preg_replace('/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/', '&amp;', $text);

	# Encode remaining <'s
	return str_replace('<', '&lt;', $text);
}

public encodeAttribute() (defined in Markdown_Parser)

Source Code

function encodeAttribute($text) {
#
# Encode text for a double-quoted HTML attribute. This function
# is *not* suitable for attributes enclosed in single quotes.
#
	# Ampersands and "<" first, then the double quotes.
	$encoded = $this->encodeAmpsAndAngles($text);
	return str_replace('"', '&quot;', $encoded);
}

public encodeEmailAddress() (defined in Markdown_Parser)

Source Code

function encodeEmailAddress($addr) {
#
#	Input: an email address, e.g. "foo@example.com"
#
#	Output: the email address as a mailto link, with each character
#		of the address encoded as either a decimal or hex entity, in
#		the hopes of foiling most address harvesting spam bots. E.g.:
#
#	  <p><a href="&#109;&#x61;&#105;&#x6c;&#116;&#x6f;&#58;&#x66;o&#111;
#        &#x40;&#101;&#x78;&#97;&#x6d;&#112;&#x6c;&#101;&#46;&#x63;&#111;
#        &#x6d;">&#x66;o&#111;&#x40;&#101;&#x78;&#97;&#x6d;&#112;&#x6c;
#        &#101;&#46;&#x63;&#111;&#x6d;</a></p>
#
#	Based on a filter by Matthew Wickline, posted to BBEdit-Talk.
#   With some optimizations by Milian Wolff.
#
	$mailto = "mailto:" . $addr;
	$pieces = preg_split('/(?<!^)(?!$)/', $mailto);
	# Deterministic seed: the same address always encodes the same way.
	$seed = (int)abs(crc32($mailto) / strlen($mailto));

	foreach ($pieces as $index => $piece) {
		$code = ord($piece);
		# Ignore non-ascii chars.
		if ($code >= 128) continue;

		# Pseudo-random function.
		$roll = ($seed * (1 + $index)) % 100;

		# roughly 10% raw, 45% hex, 45% dec
		# '@' *must* be encoded. I insist.
		if ($roll > 90 && $piece != '@') continue;

		$pieces[$index] = $roll < 45
			? '&#x'.dechex($code).';'
			: '&#'.$code.';';
	}

	$href  = implode('', $pieces);
	# Link text is the same encoding without the 7 `mailto:` characters.
	$label = implode('', array_slice($pieces, 7));

	return "<a href=\"$href\">$label</a>";
}

public formParagraphs() (defined in Markdown_Parser)

Source Code

function formParagraphs($text) {
#
#	Params:
#		$text - string to process with html <p> tags
#
#	Returns the text with every paragraph run through the span gamut and
#	wrapped in <p> tags, and every block hash key replaced by the block
#	stored for it in `html_hashes`. Chunks are joined by blank lines.
#
	# Strip leading and trailing lines:
	$text = preg_replace('/\A\n+|\n+\z/', '', $text);

	$grafs = preg_split('/\n{2,}/', $text, -1, PREG_SPLIT_NO_EMPTY);

	#
	# Wrap <p> tags and unhashify HTML blocks
	#
	foreach ($grafs as $key => $value) {
		if (!preg_match('/^B\x1A[0-9]+B$/', $value)) {
			# Is a paragraph.
			$value = $this->runSpanGamut($value);
			$value = preg_replace('/^([ ]*)/', "<p>", $value);
			$value .= "</p>";
			$grafs[$key] = $this->unhash($value);
		}
		else {
			# Is a block.
			# Modify elements of @grafs in-place...
			$graf = $value;
			$block = $this->html_hashes[$graf];
			$graf = $block;
			# The handling of <div markdown="1"> blocks below is disabled
			# (commented out):
//				if (preg_match('{
//					\A
//					(							# $1 = <div> tag
//					  <div  \s+
//					  [^>]*
//					  \b
//					  markdown\s*=\s*  ([\'"])	#	$2 = attr quote char
//					  1
//					  \2
//					  [^>]*
//					  >
//					)
//					(							# $3 = contents
//					.*
//					)
//					(</div>)					# $4 = closing tag
//					\z
//					}xs', $block, $matches))
//				{
//					list(, $div_open, , $div_content, $div_close) = $matches;
//
//					# We can't call Markdown(), because that resets the hash;
//					# that initialization code should be pulled into its own sub, though.
//					$div_content = $this->hashHTMLBlocks($div_content);
//					
//					# Run document gamut methods on the content.
//					foreach ($this->document_gamut as $method => $priority) {
//						$div_content = $this->$method($div_content);
//					}
//
//					$div_open = preg_replace(
//						'{\smarkdown\s*=\s*([\'"]).+?\1}', '', $div_open);
//
//					$graf = $div_open . "\n" . $div_content . "\n" . $div_close;
//				}
			$grafs[$key] = $graf;
		}
	}

	return implode("\n\n", $grafs);
}

public handleSpanToken() (defined in Markdown_Parser)

Source Code

function handleSpanToken($token, &$str) {
#
# Handle $token provided by parseSpan by determining its nature and 
# returning the corresponding value that should replace it.
#
# $str is the text following the token. It is passed by reference: when a
# code span is found, the consumed part is removed from it.
#
	# The first character tells what kind of token this is. (Square-bracket
	# offsets: the curly-brace form was removed in PHP 8.)
	switch ($token[0]) {
		case "\\":
			# Backslash escape: replace by the numeric entity of the
			# escaped character.
			return $this->hashPart("&#". ord($token[1]). ";");
		case "`":
			# Search for end marker in remaining text.
			if (preg_match('/^(.*?[^`])'.preg_quote($token).'(?!`)(.*)$/sm', 
				$str, $matches))
			{
				$str = $matches[2];
				$codespan = $this->makeCodeSpan($matches[1]);
				return $this->hashPart($codespan);
			}
			return $token; // return as text since no ending marker found.
		default:
			# Anything else is hashed as it is.
			return $this->hashPart($token);
	}
}

public hashBlock() (defined in Markdown_Parser)

Source Code

function hashBlock($text) {
#
# Hash $text as a block-level element: same as hashPart() with the "B"
# boundary character.
#
	$block_boundary = 'B';
	return $this->hashPart($text, $block_boundary);
}

public hashHTMLBlocks() (defined in Markdown_Parser)

Source Code

function hashHTMLBlocks($text) {
#
# Replace block-level HTML found in $text (block tags, <hr>, standalone
# comments and <? / <% processing instructions) by hash keys, using
# _hashHTMLBlocks_callback(). Returns $text unchanged when the
# `no_markup` property is set.
#
	if ($this->no_markup)  return $text;

	$less_than_tab = $this->tab_width - 1;

	# Hashify HTML blocks:
	# We only want to do this for block-level HTML tags, such as headers,
	# lists, and tables. That's because we still want to wrap <p>s around
	# "paragraphs" that are wrapped in non-block-level tags, such as anchors,
	# phrase emphasis, and spans. The list of tags we're looking for is
	# hard-coded:
	#
	# *  List "a" is made of tags which can be both inline or block-level.
	#    These will be treated block-level when the start tag is alone on 
	#    its line, otherwise they're not matched here and will be taken as 
	#    inline later.
	# *  List "b" is made of tags which are always block-level;
	#
	$block_tags_a_re = 'ins|del';
	$block_tags_b_re = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|'.
					   'script|noscript|form|fieldset|iframe|math';

	# Regular expression for the content of a block tag.
	# The $content pattern below is assembled by repeating its opening and
	# closing halves this many times.
	$nested_tags_level = 4;
	$attr = '
		(?>				# optional tag attributes
		  \s			# starts with whitespace
		  (?>
			[^>"/]+		# text outside quotes
		  |
			/+(?!>)		# slash not followed by ">"
		  |
			"[^"]*"		# text inside double quotes (tolerate ">")
		  |
			\'[^\']*\'	# text inside single quotes (tolerate ">")
		  )*
		)?	
		';
	$content =
		str_repeat('
			(?>
			  [^<]+			# content without tag
			|
			  <\2			# nested opening tag
				'.$attr.'	# attributes
				(?>
				  />
				|
				  >', $nested_tags_level).	# end of opening tag
				  '.*?'.					# last level nested tag content
		str_repeat('
				  </\2\s*>	# closing nested tag
				)
			  |				
				<(?!/\2\s*>	# other tags with a different name
			  )
			)*',
			$nested_tags_level);
	# Same pattern with the tag-name backreference \2 rewritten to \3, for
	# the alternative below where the tag name is captured in $3.
	$content2 = str_replace('\2', '\3', $content);

	# First, look for nested blocks, e.g.:
	# 	<div>
	# 		<div>
	# 		tags for inner block must be indented.
	# 		</div>
	# 	</div>
	#
	# The outermost tags must start at the left margin for this to match, and
	# the inner nested divs must be indented.
	# We need to do this before the next, more liberal match, because the next
	# match will start at the first `<div>` and stop at the first `</div>`.
	$text = preg_replace_callback('{(?>
		(?>
			(?<=\n\n)		# Starting after a blank line
			|				# or
			\A\n?			# the beginning of the doc
		)
		(						# save in $1

		  # Match from `\n<tag>` to `</tag>\n`, handling nested tags 
		  # in between.
				
					[ ]{0,'.$less_than_tab.'}
					<('.$block_tags_b_re.')# start tag = $2
					'.$attr.'>			# attributes followed by > and \n
					'.$content.'		# content, support nesting
					</\2>				# the matching end tag
					[ ]*				# trailing spaces/tabs
					(?=\n+|\Z)	# followed by a newline or end of document

		| # Special version for tags of group a.

					[ ]{0,'.$less_than_tab.'}
					<('.$block_tags_a_re.')# start tag = $3
					'.$attr.'>[ ]*\n	# attributes followed by >
					'.$content2.'		# content, support nesting
					</\3>				# the matching end tag
					[ ]*				# trailing spaces/tabs
					(?=\n+|\Z)	# followed by a newline or end of document
				
		| # Special case just for <hr />. It was easier to make a special 
		  # case than to make the other regex more complicated.
		
					[ ]{0,'.$less_than_tab.'}
					<(hr)				# start tag = $2
					'.$attr.'			# attributes
					/?>					# the matching end tag
					[ ]*
					(?=\n{2,}|\Z)		# followed by a blank line or end of document
		
		| # Special case for standalone HTML comments:
		
				[ ]{0,'.$less_than_tab.'}
				(?s:
					<!-- .*? -->
				)
				[ ]*
				(?=\n{2,}|\Z)		# followed by a blank line or end of document
		
		| # PHP and ASP-style processor instructions (<? and <%)
		
				[ ]{0,'.$less_than_tab.'}
				(?s:
					<([?%])			# $2
					.*?
					\2>
				)
				[ ]*
				(?=\n{2,}|\Z)		# followed by a blank line or end of document
				
		)
		)}Sxmi',
		[&$this, '_hashHTMLBlocks_callback'],
		$text);

	return $text;
}

public hashPart() (defined in Markdown_Parser)

Source Code

function hashPart($text, $boundary = 'X') {
#
# Replace $text by a unique placeholder token and remember the mapping in
# $this->html_hashes, so that unhash() can swap the text back in later.
# The returned token is what the caller inserts in the text stream.
#
# $boundary is the character placed at both ends of the token. By
# convention "B" marks block elements that must not be wrapped into
# paragraph tags at the end, ":" marks elements acting as word separators,
# and "X" covers the general case.
#
	# Counter kept across calls: each token gets a number not used before.
	static $counter = 0;

	# Expand tokens already present in $text first, so stored values never
	# contain tokens themselves and one unhash pass suffices at the end.
	$plain = $this->unhash($text);

	$counter++;
	$token = $boundary . "\x1A" . $counter . $boundary;
	$this->html_hashes[$token] = $plain;

	return $token;
}

public makeCodeSpan() (defined in Markdown_Parser)

Source Code

function makeCodeSpan($code) {
#
# Build the <code> markup for a code span and return the hash token that
# stands for it. Called from handleSpanToken.
#
	# Surrounding whitespace is dropped, then HTML special characters are
	# escaped; ENT_NOQUOTES leaves single and double quotes untouched.
	$escaped = htmlspecialchars(trim($code), ENT_NOQUOTES);
	$markup = '<code>' . $escaped . '</code>';

	return $this->hashPart($markup);
}

public outdent() (defined in Markdown_Parser)

Source Code

function outdent($text) {
#
# Strip one indentation level from the start of every line of $text:
# either a single tab, or between one and $this->tab_width spaces.
#
	$indent_re = '/^(\t|[ ]{1,'.$this->tab_width.'})/m';

	return preg_replace($indent_re, '', $text);
}

public parseSpan() (defined in Markdown_Parser)

Source Code

function parseSpan($str) {
#
# Walk through $str looking for tokens: backslash-escaped characters, code
# span markers and, unless $this->no_markup is set, embedded HTML
# (comments, processing instructions, tags). Text between tokens is copied
# to the output as is; every token is delegated to handleSpanToken and
# replaced by whatever that method returns.
#
	$span_re = '{
			(
				\\\\'.$this->escape_chars_re.'
			|
				(?<![`\\\\])
				`+						# code span marker
		'.( $this->no_markup ? '' : '
			|
				<!--    .*?     -->		# comment
			|
				<\?.*?\?> | <%.*?%>		# processing instruction
			|
				<[/!$]?[-a-zA-Z0-9:]+	# regular tags
				(?>
					\s
					(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*
				)?
				>
		').'
			)
			}xs';

	$result = '';

	do {
		# With a limit of 2 and PREG_SPLIT_DELIM_CAPTURE the split yields
		# [text before token, token, remaining text], or only [text] once
		# no token is left.
		$pieces = preg_split($span_re, $str, 2, PREG_SPLIT_DELIM_CAPTURE);

		# Text in front of the token goes to the output untouched.
		if ($pieces[0] != "") {
			$result .= $pieces[0];
		}

		$found_token = isset($pieces[1]);
		if ($found_token) {
			# Scanning resumes from $pieces[2] as it stands after the call.
			# NOTE(review): handleSpanToken presumably receives the rest of
			# the string by reference and may consume part of it -- confirm
			# against its signature.
			$result .= $this->handleSpanToken($pieces[1], $pieces[2]);
			$str = $pieces[2];
		}
	} while ($found_token);

	return $result;
}

public prepareItalicsAndBold() (defined in Markdown_Parser)

Source Code

function prepareItalicsAndBold() {
#
# Precompute the expressions used to search for emphasis tokens. One
# expression is built for each pairing of a key of $this->em_relist with a
# key of $this->strong_relist and stored in
# $this->em_strong_prepared_relist under both keys concatenated.
#
	foreach ($this->em_relist as $em => $em_re) {
		foreach ($this->strong_relist as $strong => $strong_re) {
			$context = "$em$strong";

			# Alternation order: the combined em+strong expression first
			# (when one is defined for this pairing), then em, then strong.
			$alternatives = [$em_re, $strong_re];
			if (isset($this->em_strong_relist[$context])) {
				array_unshift($alternatives, $this->em_strong_relist[$context]);
			}

			# Wrap the alternation in a single capturing group.
			$this->em_strong_prepared_relist[$context] =
				'{('. implode('|', $alternatives) .')}';
		}
	}
}

public processListItems() (defined in Markdown_Parser)

Source Code

function processListItems($list_str, $marker_any_re) {
#
#	Split the body of one ordered or unordered list into its items. Each
#	item matched is passed to _processListItems_callback and replaced by
#	that callback's return value.
#
#	$marker_any_re is the expression matching a list marker.
#
	# $this->list_level counts how deeply nested in lists we currently are:
	# it goes up by one on entry here and back down on exit, so zero means
	# "not inside any list".
	#
	# The counter exists because of text such as:
	#
	#		I recommend upgrading to version
	#		8. Oops, now this line is treated
	#		as a sub-list.
	#
	# Outside of a list this has to stay a single paragraph even though its
	# second line begins with digit-period-space. Inside a list (or a
	# sub-list) that same line is taken as the start of a sub-list. It is a
	# kludge: this corner of the Markdown syntax cannot be parsed perfectly
	# without guessing the author's intent. A syntax rule requiring
	# sub-lists to begin with a starting number such as "1." or "a." might
	# be the real fix.

	$item_re = '{
		(\n)?							# leading line = $1
		(^[ ]*)							# leading whitespace = $2
		('.$marker_any_re.'				# list marker and space = $3
			(?:[ ]+|(?=\n))	# space only required if item is not empty
		)
		((?s:.*?))						# list item text   = $4
		(?:(\n+(?=\n))|\n)				# tailing blank line = $5
		(?= \n* (\z | \2 ('.$marker_any_re.') (?:[ ]+|(?=\n))))
		}xm';

	$this->list_level++;

	# Reduce a run of blank lines at the very end to one newline.
	$trimmed = preg_replace("/\n{2,}\\z/", "\n", $list_str);

	$items = preg_replace_callback(
		$item_re, [$this, '_processListItems_callback'], $trimmed);

	$this->list_level--;

	return $items;
}

public runBasicBlockGamut() (defined in Markdown_Parser)

Source Code

function runBasicBlockGamut($text) {
#
# Apply the block gamut transformations to $text without hashing HTML
# blocks beforehand. Meant for text whose HTML blocks are already hashed,
# as is the case in the first whole-document pass.
#
	# The keys of block_gamut are the method names to call, in iteration
	# order; the priority values are not consulted here.
	foreach (array_keys($this->block_gamut) as $transformation) {
		$text = $this->$transformation($text);
	}

	# Last step: form paragraphs and restore the hashed blocks.
	return $this->formParagraphs($text);
}

public runBlockGamut() (defined in Markdown_Parser)

Source Code

function runBlockGamut($text) {
#
# Hash raw HTML blocks in $text, then apply the block gamut
# transformations.
#
	# Hashing HTML once at the start of the document is not enough: HTML
	# blocks can sit inside list items, where their indentation made an
	# earlier hashHTMLBlocks pass see them as code blocks. Hence the hashing
	# is repeated for each block before anything else runs.
	$hashed = $this->hashHTMLBlocks($text);

	return $this->runBasicBlockGamut($hashed);
}

public runSpanGamut() (defined in Markdown_Parser)

Source Code

function runSpanGamut($text) {
#
# Apply the span gamut transformations to $text and return the result.
#
	# The keys of span_gamut are the method names to call, in iteration
	# order; the priority values are not consulted here.
	foreach (array_keys($this->span_gamut) as $transformation) {
		$text = $this->$transformation($text);
	}

	return $text;
}

public setup() (defined in Markdown_Parser)

Source Code

function setup() {
#
# Called before the transformation process starts to put the parser back
# into its initial state.
#
	# Link tables start from the predefined URLs and titles; the table of
	# hashed parts starts empty.
	$this->urls = $this->predef_urls;
	$this->titles = $this->predef_titles;
	$this->html_hashes = [];

	# Reset the in_anchor property itself. A bare `$in_anchor` would only
	# create an unused local variable and leave a flag set during a
	# previous run in place.
	$this->in_anchor = false;
}

public stripLinkDefinitions() (defined in Markdown_Parser)

Source Code

function stripLinkDefinitions($text) {
#
# Find link definitions in $text and replace each one by the return value
# of _stripLinkDefinitions_callback.
# NOTE(review): that callback is presumably where the URL and title get
# recorded and the definition removed from the text -- it is not visible
# from this block.
#
# A link definition has the form: ^[id]: url "optional title"
#
	# A definition may be indented by less than one tab width.
	$max_indent = $this->tab_width - 1;

	$definition_re = '{
						^[ ]{0,'.$max_indent.'}\[(.+)\][ ]?:	# id = $1
						  [ ]*
						  \n?				# maybe *one* newline
						  [ ]*
						<?(\S+?)>?			# url = $2
						  [ ]*
						  \n?				# maybe one newline
						  [ ]*
						(?:
							(?<=\s)			# lookbehind for whitespace
							["(]
							(.*?)			# title = $3
							[")]
							[ ]*
						)?	# title is optional
						(?:\n+|\Z)
		}xm';

	return preg_replace_callback(
		$definition_re, [$this, '_stripLinkDefinitions_callback'], $text);
}

public teardown() (defined in Markdown_Parser)

Source Code

function teardown() {
#
# Called once the transformation process is over: empties the URL, title
# and hashed-part tables so they do not keep memory occupied needlessly.
#
	$this->urls = $this->titles = $this->html_hashes = [];
}

public transform() (defined in Markdown_Parser)

Source Code

function transform($text) {
#
# Main entry point. Normalises the input text, runs it through every
# document gamut method and returns the result followed by a newline.
#
	$this->setup();

	# Drop a leading UTF-8 BOM as well as any \x1A byte; hashPart uses
	# \x1A as the marker inside its tokens.
	$text = preg_replace('{^\xEF\xBB\xBF|\x1A}', '', $text);

	# Line endings: DOS (\r\n) and Mac (\r) both become Unix (\n). The
	# text is also made to end with a couple of newlines.
	$text = preg_replace('{\r\n?}', "\n", $text) . "\n\n";

	# Tabs become spaces, then block-level HTML is turned into hash
	# entries.
	$text = $this->hashHTMLBlocks($this->detab($text));

	# Empty out lines made only of spaces. Later expressions can then
	# match consecutive blank lines with a plain /\n+/ rather than
	# something contorted like /[ ]*\n+/ .
	$text = preg_replace('/^[ ]+$/m', '', $text);

	# The keys of document_gamut are the method names to call, in
	# iteration order; the priority values are not consulted here.
	foreach (array_keys($this->document_gamut) as $step) {
		$text = $this->$step($text);
	}

	$this->teardown();

	return $text . "\n";
}

public unhash() (defined in Markdown_Parser)

Source Code

function unhash($text) {
#
# Replace every hash token found in $text by the return value of
# _unhash_callback.
#
	# A token is one character, \x1A, a run of digits, then that same
	# character again: the shape hashPart gives to the keys it creates.
	$token_re = '/(.)\x1A[0-9]+\1/';

	return preg_replace_callback($token_re, [$this, '_unhash_callback'], $text);
}

Do you want to contribute to Koseven?

We need YOUR help!

This project is open source. What does this mean? YOU can help:
  • Found a bug? Report it on GitHub
  • Need a feature? Add it here
  • Want to help? Join the Forum
Go to GitHub