13 files changed, 423 insertions, 212 deletions
diff --git a/it.class b/it.class
index 624f601..1be8f7c 100644
--- a/it.class
+++ b/it.class
@@ -123,6 +123,7 @@ static function timerlog($label = '')
  * @param $p['graceperiod'] number of seconds within which additional errors are ignored if id is set
  * @param $p['timewindow'] number of seconds after graceperiod within which the second error must occur if id is set
  * @param $p['backtraceskip'] number of stack levels to drop
+ * @param $p['skipfiles'] files to skip in backtrace
  * @param $p['blockmail'] number of seconds to block mails after having sent a mail [3600]
  * @param $p['blockmailid'] block mail for $p['blockmail'] seconds with same id. Default: $p['to']
  * @param $p['omitdebuginfo'] Do not add stack dump, locals and environment to output [false]
@@ -193,7 +194,7 @@ static function error($p = array(), $body = null, $to = null) # $body and $to de
 
 	if ($toscreen || $sendmail)
 	{
-		$trace = it_debug::backtrace($p['backtraceskip']); # moved in here for performance in mass error case
+		$trace = it_debug::backtrace(array('skiplevels' => $p['backtraceskip'], 'skipfiles' => $p['skipfiles'])); # moved in here for performance in mass error case
 
 		if (strlen($p['body']) > 500000)
 		{
@@ -230,7 +231,7 @@ static function error($p = array(), $body = null, $to = null) # $body and $to de
 			it::mail(array('To' => $p['to'], 'Subject' => substr($p['title'], 0, 80), 'Body' => $body) + (($cc = $GLOBALS['it_defaultconfig']['error_cc']) ? array('Cc' => $cc) : array()));
 		}
 		else if ($_SERVER['REMOTE_ADDR']) # toscreen mode: web
-			echo "<pre>{$p['title']}\n".rtrim($body)."</pre>";
+			echo "<pre>" . htmlspecialchars($p['title'] . "\n" . rtrim($body), ENT_COMPAT, "iso-8859-1") . "</pre>"; # works with iso-8859-1 or utf-8, UTF8SAFE
 		else  # toscreen mode: shell (outputs to stderr)
 			error_log($p['title'] . " in " . ($trace ? $trace : "{$p['file']}:{$p['line']} Url: $url") . " " . (EDC('verbose') ? D($p['locals']) : ""));
 	}
@@ -334,7 +335,6 @@ static function convertregex($pattern, $p = null)
  * @param $string String to match
  * @param $p['offset_capture'] Set flag preg_offset_capture (returns offsets with the matches).
  * @param $p['all'] Return every match as array instead of first match.
- * @param $p['locale'] Use given locale (default: de_CH), mainly affects handling of iso-latin chars
  * @param $p contains pattern modifiers, @see convertregex()
  * @return Matched string or false 
  */
@@ -346,21 +346,18 @@ static function match($pattern, $string, $p = null)
 	{
 		$flags = $p['offset_capture'] ? PREG_OFFSET_CAPTURE : 0;
 
-		$oldlocale = setlocale(LC_CTYPE, 0);
-		setlocale(LC_CTYPE, $p['locale'] ? $p['locale'] : "de_CH");
-
 		if ($p['all'])
 			$r = preg_match_all(it::convertregex($pattern, $p), $string, $m, $flags | PREG_PATTERN_ORDER, $p['offset']);
 		else
 			$r = preg_match(it::convertregex($pattern, $p), $string, $m, $flags, $p['offset']);
-
-		setlocale(LC_CTYPE, $oldlocale);
 	}
 
 	if (!$r)	# no match
 	{
 		if (preg_last_error() == PREG_BACKTRACK_LIMIT_ERROR)
 			it::error("Exceeded pcre.backtrack_limit of " . ini_get('pcre.backtrack_limit') . " bytes");
+		else if (preg_last_error() == PREG_BAD_UTF8_ERROR)
+			it::error("Input to it::match is not valid utf-8");
 
 		$result = $p['all'] ? array() : null;
 	}
@@ -384,23 +381,66 @@ static function match($pattern, $string, $p = null)
  */
 static function replace($replacements, $string, $p = array())
 {
+	$encoding = ini_get('default_charset') == 'utf-8' ? 'u' : '';
 	foreach ($replacements as $pattern => $dummy)
-		$patterns[] = !preg_match('/\\\\[wb]|[!\x80-\xff]|\[\[:/i', $pattern) && !$p ? "!$pattern!i" : it::convertregex($complex = $pattern, $p);
+		$patterns[] = !preg_match('/\\\\[wb]|[!\x80-\xff]|\[\[:/i', $pattern) && !$p ? "!$pattern!i$encoding" : it::convertregex($pattern, $p);
+
+	$result = preg_replace($patterns, $replacements, $string, isset($p['limit']) ? $p['limit'] : -1);
+
+	 if ($result === null && preg_last_error() == PREG_BAD_UTF8_ERROR)
+		it::error("Input to it::replace is not valid utf-8");
 
-	if (!$complex && !$p)
-		$result = preg_replace($patterns, $replacements, $string);
+	return $result;
+}
+
+/**
+ * Returns only the array elements matching the given regex
+ * @param $pattern Regex to match against
+ * @param $array   array to grep
+ * @return New array
+ */
+static function grep($pattern, $array, $p = array())
+{
+	if (!preg_match('/\\\\[wb]|[!\x80-\xff]|\[\[:/i', $pattern) && !$p)
+		$result = preg_grep('!' . $pattern . '!i' . (ini_get('default_charset') == 'utf-8' ? 'u' : ''), $array); # fast path for simple patterns
 	else
-	{
-		$oldlocale = setlocale(LC_CTYPE, 0);
-		setlocale(LC_CTYPE, 'de_CH');
-		$result = preg_replace($patterns, $replacements, $string, isset($p['limit']) ? $p['limit'] : -1);
-		setlocale(LC_CTYPE, $oldlocale);
-	}
+		$result = preg_grep(it::convertregex($pattern, $p), $array);
 
 	return $result;
 }
 
 /**
+ * Convert string to utf8 if it was not already utf-8 before
+ * @param $value String to convert
+ * @return Same string in utf-8 encoding
+ */
+static function any2utf8($value)
+{
+	return grapheme_strlen($value) === null ? utf8_encode($value) : $value;	# a null length is taken to mean "not valid utf-8", so treat input as latin1
+}
+
+/**
+ * Uppercase first character similar to ucfirst() but for mbstring.internal_encoding
+ */
+static function ucfirst($string)
+{
+	return mb_strtoupper(mb_substr($string, 0, 1)) . mb_substr($string, 1);
+}
+
+/**
+ * Uppercase first character of each word similar to ucwords() but for mbstring.internal_encoding
+ */
+static function ucwords($string)
+{
+	return preg_replace_callback('/\b\w/u', function($m) { return mb_strtoupper($m[0]); }, mb_strtolower($string));
+}
+
+static function substr_replace($string, $replacement, $start, $length)
+{
+	return grapheme_substr($string, 0, $start) . $replacement . grapheme_substr($string, $start + $length);
+} 
+
+/**
  * Extract key => value pairs from assoc array by key
  * @param $array array to filter
  * @param $keys array or comma separated list of keys to keep
diff --git a/it_dbi.class b/it_dbi.class
index ba0606c..ced925b 100644
--- a/it_dbi.class
+++ b/it_dbi.class
@@ -247,7 +247,11 @@ function _set($tags, $allfields = false)
 		if (substr($field, 0, 1) == '-')		# Unquoted value (always added)
 			$r[] = substr($field, 1)."=$value";
 		else if ($allfields || ($value !== $this->_data[$field]))
+		{
+			if ($this->_p['charset'] == "utf8")	# NOTE: Mysql charset is simply utf8, not utf-8
+				$value = it_html::fix_encoding($value);
 			$r[] = "`$field`=".(isset($value) ? $this->escape_string($value) : 'NULL');
+		}
 	}
 
 	return $r ? 'SET '.implode(', ', $r) : '';
diff --git a/it_html.class b/it_html.class
index 92aa9ba..4c9f3e7 100644
--- a/it_html.class
+++ b/it_html.class
@@ -38,7 +38,7 @@ function it_html($p = array())
 {
 	# Default configuration of html class
 	$this->p = $p + array(
-		'charset' => 'iso-8859-1',
+		'charset' => ini_get('default_charset') ?: 'iso-8859-1',
 		'doctype' => null,                    # Custom doctype (will usually be calculated from htmltype)
 		'head' => '',                         # Code to put into head() section
 		'htmltype' => 'xhtml',                # 'html' (=old-style), 'xhtml' or 'xhtml-mobile'
@@ -48,7 +48,7 @@ function it_html($p = array())
 		'name' => 'it_html',                  # Name of global variable $this is assigned to (string), XXX Copy and paste in configure() to keep PHP4 compatibility
 		'nonewlinetags' => 'a,b,em,img,input,label,span,noscript', # tags that do not like newlines after them
 		'notexported' => 'configure,sanitize',# Those methods are not exported
-		'prettyprint' => false,               # Should output be prettily indented?
+		'prettyprint' => it::is_devel(),      # Should output be prettily indented?
 		'show_boot_dom' => false,             # If true, append invisible <div id="it_boot_dom"> at the end of body
 		'show_content_type' => true,          # If true, add <meta http-equiv="Content-Type" ...> header
 		'show_favicon' => true,               # If true, add <link> tag to /favicon.ico if it exists
@@ -258,6 +258,21 @@ function _parse_args($args)
 }
 
 
+# internal: returns $string repaired to single utf-8 encoding; reports the problem via it::error() unless $silent is set
+function fix_encoding($string, $silent = false)
+{
+	if (grapheme_strlen($string) === null)	# null length is taken to mean invalid utf-8: treat input as latin1 and encode it
+		list($string, $error) = array(utf8_encode($string), utf8_encode("incorrectly utf8-encoded: " . trim($string)));
+	else if ($string && preg_match('/[\x80-\xff]/', $string) && grapheme_strlen(utf8_decode($string)) !== null && utf8_encode(utf8_decode($string)) === $string)	# decoding once still gives valid utf-8 that round-trips
+		list($string, $error) = array(utf8_decode($string), utf8_encode("doubly utf8-encoded: " . trim($string)));
+
+	if (!empty($error) && !$silent)	# $error is only assigned when one of the branches above fired; empty() avoids an undefined-variable notice
+		it::error(array('title' => $error, 'skipfiles' => "it_html"));
+
+	return $string;
+}
+
+
 /**
  * function div($args...)
  * Return a <div>...</div> element
@@ -312,10 +327,12 @@ function _tag($name, $args)
 	else
 		$result .= " />$newline";
 
+	if ($GLOBALS['debug_utf8check'] && $GLOBALS['it_html']->p['charset'] == "utf-8")
+		$result = self::fix_encoding($result);
+
 	return $result;
 }
 
-
 /**
  * Return a <tag> containing optional data.
  * @param $name tag name ('style', etc.)
@@ -414,9 +431,11 @@ function _strip_tags($html)
 function sanitize($html)
 {
 	$result = "";
+	$charset = $GLOBALS['it_html']->p['charset'] ? $GLOBALS['it_html']->p['charset'] : 'iso-8859-1';
+	if ($charset == "utf-8")
+		$html = it::any2utf8($html);
 	$html = it::replace(array('[\0\s]+' => " "), $html);	# \s also matches \r and \n
 	$urlpattern = 'https?://[^">]+';
-	$charset = $GLOBALS['it_html']->p['charset'] ? $GLOBALS['it_html']->p['charset'] : 'iso-8859-1';
 
 	if ($tag = it::match("(.*)<(div|p|i|b)\b[^>]*>(.*?)</\\2>(.*)", $html))
 	{
@@ -450,11 +469,11 @@ function sanitize($html)
 }
 
 /**
- * Decode all entities, ensure latin-1 encoding
+ * Decode all entities to encoding set for it_html
  */
 function entity_decode($string)
 {
-	$string = preg_replace('/&#(8217|65533);/', "'", html_entity_decode($string));
+	$string = preg_replace('/&#(8217|65533);/', "'", html_entity_decode($string, ENT_COMPAT, $GLOBALS['it_html']->p['charset']));
 	$string = preg_replace_callback('/&#x0*([0-9a-f]+);/i', function($m) { return hexdec($m[1]) <= 255 ? chr(hexdec($m[1])) : " "; }, $string);
 	$string = preg_replace_callback('/&#0*([0-9]+);/', function($m) { return $m[1]  <= 255 ? chr($m[1]) : " "; }, $string);
 
@@ -476,8 +495,13 @@ function latinize($string)
  */
 function Q($string)
 {
-	if (preg_match('/[<>&"\x00-\x08\x0a-\x0c\x0e-\x1f\x80-\x9f]/', $string)) # WARNING: copy/pasted to _tag()
+	if (preg_match('/[<>&"\x00-\x08\x0a-\x0c\x0e-\x1f\x80-\xff]/', $string)) # WARNING: copy/pasted to _tag()
+	{
+		if ($GLOBALS['debug_utf8check'] && $GLOBALS['it_html']->p['charset'] == "utf-8")
+			$string = self::fix_encoding($string);
+
 		$string = htmlspecialchars($GLOBALS['it_html']->p['charset'] == "iso-8859-1" ? it_html::latinize($string) : $string, ENT_COMPAT, $GLOBALS['it_html']->p['charset']);
+	}
 
 	return $GLOBALS['debug_q'] && $string ? "<span style='background:#8FF'>$string</span>" : $string;
 }
diff --git a/it_xml.class b/it_xml.class
index 0679c69..f854682 100644
--- a/it_xml.class
+++ b/it_xml.class
@@ -29,7 +29,7 @@ class it_xml
  * @param $p associative array
  * @param $p['forcearray'] xml tags to ALWAYS return as array
  * @param $p['safety'] 2 causes program abort with invalid xml, 1 (default) causes error report, 0 just returns false
- * @param $p['encoding'] Output character encoding (e.g. UTF-8, default: ISO-8859-1)
+ * @param $p['encoding'] Output character encoding (utf-8, iso-8859-1 or us-ascii, default: ini_get('default_charset'))
  * @param $p['prefix'] Optional prefix for class names
  * @param $p['lowercase'] Lowercase all tag and attribute names
  * @return XML object tree or null on failure
@@ -49,23 +49,22 @@ function it_xml($xmldata = "", $p = array())
 function create($xmldata, $p = array())
 {
 	$xml = new it_xml;
-
 	return $xml->from_xml($xmldata, array('factory' => true) + $p) ? $xml->_root : null;
 }
 
 function from_xml($xmldata, $p)
 {
-	$this->_p = $p + array('encoding' => "ISO-8859-1", 'safety' => 1);
+	$this->_p = $p + array('encoding' => ini_get('default_charset'), 'safety' => 1);
 	$this->_arrayforce = array_flip((array)$this->_p['forcearray']);
 	$this->_stack = array();
 	unset($this->error);
-	$parser = xml_parser_create($this->_p['encoding']);
+	$parser = xml_parser_create();
 	xml_set_object($parser, $this);
 	xml_set_element_handler($parser, "start_element", "end_element");
 	xml_set_character_data_handler($parser, "character_data");
 	xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0);
 	xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, $this->_p['encoding']);
-	
+
 	$result = true;
 
 	if (is_resource($xmldata))
@@ -123,7 +122,7 @@ function _sanitize($xmldata, $isutf8 = null)
 		if (!preg_match('/^(<\?xml|\xEF\xBB\xBF|\xFE\xFF|\xFF\xFE|\x00\x00\xFE\xFF|\x00\x00\xFF\xFE)/', $xmldata))
 			$xmldata = '<?xml version="1.0" encoding="' . $this->_p['encoding'] . '"?>' . $xmldata;
 
-		$isutf8 = (!preg_match('/^<\?xml[^>]* encoding=/i', $xmldata) || preg_match('/^<\?xml[^>]* encoding=.UTF-8/i', $xmldata));
+		$isutf8 = (!preg_match('/^<\?xml[^>]* encoding=/i', $xmldata) || preg_match('/^<\?xml[^>]* encoding=.utf-8/i', $xmldata));
 	}
 
 	# Decode illegal entities but protect semantically important ones
diff --git a/itjs.class b/itjs.class
index a7a6a15..e41ffe5 100644
--- a/itjs.class
+++ b/itjs.class
@@ -23,12 +23,18 @@
 
 class itjs
 {
+static $charset;
 
 /**
  * Send HTTP headers (content-type) to transmit javascript code
  */
-function send_headers($charset = 'iso-8859-1')
+function send_headers($charset = null)
 {
+	if (!$charset)
+		$charset = ini_get('default_charset') ?: 'iso-8859-1';
+
+	self::$charset = $charset;
+
 	if (!preg_match('/Opera/', $_SERVER['HTTP_USER_AGENT']) && !$_REQUEST['itjs_iframe'])	# text/plain breaks Opera 8.51/Linux and IFrame fallback
 		header("Content-Type: text/plain; charset=$charset");	# Berni reported some Firewalls to require this
 
@@ -67,16 +73,17 @@ function serialize($values, $envelope = false)
  */
 function encode($values)
 {
-	$texts = ($values === array_values($values)) ? "[]0 " : "{}1\n";	# Numerical or associative array
 	static $jskeyword = array("abstract" => 1, "boolean" => 1, "break" => 1, "byte" => 1, "case" => 1, "catch" => 1, "char" => 1, "class" => 1, "const" => 1, "continue" => 1, "debugger" => 1, "default" => 1, "delete" => 1, "do" => 1, "double" => 1, "each" => 1, "else" => 1, "enum" => 1, "export" => 1, "extends" => 1, "false" => 1, "final" => 1, "finally" => 1, "float" => 1, "for" => 1, "function" => 1, "goto" => 1, "if" => 1, "implements" => 1, "import" => 1, "in" => 1, "instanceof" => 1, "int" => 1, "interface" => 1, "long" => 1, "namespace" => 1, "native" => 1, "new" => 1, "null" => 1, "package" => 1, "private" => 1, "protected" => 1, "public" => 1, "return" => 1, "short" => 1, "static" => 1, "super" => 1, "switch" => 1, "synchronized" => 1, "this" => 1, "throw" => 1, "throws" => 1, "transient" => 1, "true" => 1, "try" => 1, "typeof" => 1, "var" => 1, "void" => 1, "volatile" => 1, "while" => 1, "with" => 1, "xml" => 1);
 
-	$result = $texts{0};
+	$charset = self::$charset ?: ini_get('default_charset');
+	$texts = ($values === array_values($values)) ? "[]0 " : "{}1\n";	# Numerical or associative array
+	$result = $texts[0];
 
 	foreach ($values as $key => $value)
 	{
 		$result .= $separator;
 
-		if ($texts{2})
+		if ($texts[2])
 		{
 			if ($jskeyword[$key] || !preg_match('/^[a-z_]\w*$/i', $key))
 				$key = "'$key'";
@@ -91,17 +98,19 @@ function encode($values)
 		else if (!is_array($value))
 		{
 			$quote = (strval(intval($value)) === strval($value)) ? "" : '"';
-			$string = strtr($value, array("\0" => '\\0', "\x84" => '\\"', "\x93" => '\\"',"\x94" => '\\"', '"' => '\\"', "</"=>"<\\/", "\n" => '\\n', "\r" => '\\r', "\t" => '\\t', "\\" => '\\\\'));
+			if (strtolower($charset) != "utf-8")
+				$value = strtr($value, array("\x84" => '"', "\x93" => '"', "\x94" => '"'));
+			$string = strtr($value, array("\0" => '\\0', '"' => '\\"', "</"=>"<\\/", "\n" => '\\n', "\r" => '\\r', "\t" => '\\t', "\\" => '\\\\'));
 			$string = $GLOBALS['itjs_defaultconfig']['latin2unicode'] ? preg_replace_callback('/([\xa0-\xff])/', function($m) { return sprintf("\\u%04x", ord($m[1])); }, $string) : $string;
 			$result .= $quote . $string . $quote;
 		}
 		else
 			$result .= itjs::encode($value);
 
-		$separator = "," . $texts{3};
+		$separator = "," . $texts[3];
 	}
 
-	$result .= $texts{1};
+	$result .= $texts[1];
 
 	return $result;
 }
diff --git a/itjs.php b/itjs.php
index 03073e9..f819cab 100644
--- a/itjs.php
+++ b/itjs.php
@@ -33,7 +33,7 @@ foreach ($files as $file)
 {
 	ob_start();	# Needs to capture inside loop to guarantee file order
 	if (!(it::match('W3C_CSS_Validator', $_SERVER['HTTP_USER_AGENT']) && it::match('jquery-ui\.css', $file)))
-		$data .= it::replace(array('^1$' => ""), @include($file));
+		$data .= it::replace(array('^1$' => ""), @include($file), array('utf8' => false));
 	$data .= ob_get_clean();
 }
 
@@ -50,9 +50,8 @@ else if (it::match('\.css', $_SERVER['PHP_SELF']))
 	$data .= "\n#it_boot_dom { display:none }\n";	# Append magic style for it_boot
 	if (!it::match('^devel', $GLOBALS['ULTRASERVERTYPE']))
 		$data = it::replace(array('[ \t]*([{};])[ \t]*' => '$1', '/\*.*?\*/' => ""), $data);
-	if (it::match('W3C_CSS_Validator', $_SERVER['HTTP_USER_AGENT']) || EDC('w3c')) {
+	if (it::match('W3C_CSS_Validator', $_SERVER['HTTP_USER_AGENT']) || EDC('w3c'))
 		$data = it::replace(array('@-.*' => "", 'background[^;}]*(gradient|rgba)[^;}]*;?' => "", '(filter:\s*(progid|none)|text-overflow:|zoom:|-webkit-|display:-moz-|-moz-|-o-|cursor:|border-radius:|behavior:|\w+:expression)[^;}]*;?' => "", 'html\.ie6.*' => "", '([^/])\*(\w)' => '$1$2'), $data);
-	}
 }
 else if (it::match('\.htc$', $file))
 {
@@ -64,7 +63,8 @@ else if (!it::match('\.html$', $file))
 	if ($_REQUEST['boot'] && !$_REQUEST['retry'])
 		ob_start('ob_gzhandler');
 
-	header("Content-Type: application/x-javascript; charset=iso-8859-1");
+	$charset = ini_get('default_charset') ?: 'iso-8859-1';
+	header("Content-Type: application/x-javascript; charset=$charset");
 }
 
 @header("Etag: $checksum");
@@ -80,7 +80,7 @@ if ($checksum != $_SERVER['HTTP_IF_NONE_MATCH'])
 	{
 		$data .= "window.it_boot_init();\n";
 		if (!$_REQUEST['script'])
-			$data = sprintf("/*sln:%d*/\n%s/*eln:%d*/", strlen($data), strtr($data, array('%' => "%25", '.' => "%2e", 'e' => "%65", 'i' => "%69")), strlen($data));	# Protect from Firewalls/Proxies altering Javascript source code
+			$data = sprintf("/*sln:%d*/\n%s/*eln:%d*/", grapheme_strlen($data), strtr($data, array('%' => "%25", '.' => "%2e", 'e' => "%65", 'i' => "%69")), grapheme_strlen($data));	# Protect from Firewalls/Proxies altering Javascript source code
 	}
 
 	echo it_untaint($data);
diff --git a/itjs/it.js b/itjs/it.js
index 54fa5c7..b31423e 100644
--- a/itjs/it.js
+++ b/itjs/it.js
@@ -280,6 +280,24 @@ function it_url_encode(str)
 }
 
 /**
+ * Unicode-safe equivalent of unescape()
+ * @param str string URL encoded string to be decoded
+ */
+function it_url_decode(str)
+{
+	str = str.replace(/\+/g, '%20');
+
+	// catch URI malformed errors
+	try {
+		if (window.decodeURIComponent)
+			return decodeURIComponent(str);
+	}
+	catch(e) {}
+
+	return unescape(str);
+}
+
+/**
  * Patch PNG transparency for IE 5.5-6 on the given image
  */
 function it_pngfix(img, w, h, mode)
diff --git a/tests/exec.t b/tests/exec.t
index 689bd26..9425420 100755
--- a/tests/exec.t
+++ b/tests/exec.t
@@ -11,7 +11,7 @@ is(it::shell_command("echo {arg}", array('arg' => '')), "echo ''", "quote empty
 
 foreach (array("", "C", "de_CH", "de_CH.utf8") as $locale) {
 	setlocale(LC_ALL, $locale);
-	$arg = "pre�post";
+	$arg = "preüpost";
 	if (it::match('utf8', $locale))
 		$arg = utf8_encode($arg);
 	is(it::exec("echo " . $arg), $arg . "\n", "exec with umlaut (locale '$locale')");
diff --git a/tests/getopt.t b/tests/getopt.t
index 7a84588..d67738d 100755
--- a/tests/getopt.t
+++ b/tests/getopt.t
@@ -17,7 +17,7 @@ function getopt_ok($argv, $exp, $name)
 	return is($got['argument'], $exp, $name);
 }
 
-foreach (array("" => "blah gnaber", " (umlaute)" => "pre ��post") as $variant => $testarg) {
+foreach (array("" => "blah gnaber", " (umlaute)" => "pre üäpost") as $variant => $testarg) {
 	getopt_ok(array('-a', $testarg), $testarg, "Short version" . $variant);
 	getopt_ok(array('--argument', $testarg), $testarg, "Long version with space" . $variant);
 	getopt_ok(array("--argument=$testarg"), $testarg, "Long version with equal" . $variant);
diff --git a/tests/it.t b/tests/it.t
index 1a308ec..8e4a7e2 100755
--- a/tests/it.t
+++ b/tests/it.t
@@ -3,252 +3,317 @@
 
 # Tests for it.class
 
-function match($regex, $string, $expect, $name)
+
+#
+# tests for it::match()
+#
+$oldcharset = ini_get('default_charset');
+$oldlocale = setlocale(LC_CTYPE, 0);
+
+ini_set('default_charset', 'utf-8');
+setlocale(LC_CTYPE, 'de_CH');		# required because we're checking German umlauts in latin1 mode
+
+
+function match($regex, $string, $expect, $name, $p = array())
 {
 	$GLOBALS['TEST_MORE_LEVEL'] = 1;
-	$pass = is (it::match($regex, $string), $expect, $name);
+	$pass = is (it::match($regex, $string, $p), $expect, $name);
 	if (!$pass) {
-		diag("        regex given: $regex");
+		diag("        regex given: $regex" . ($p ? " " .D($p) : ""));
 		diag("    regex converted: " . it::convertregex($regex));
 	} 
 	$GLOBALS['TEST_MORE_LEVEL'] = 0;
 }
 
+
 match(
 	'b', 'aaaabaaaa',
 	'b',
 	'simple regex'
-	);
+);
+
 match(
 	'a/b', '   a/b   ',
 	'a/b',
 	'regex with /'
 );
+
 match(
 	'aa(bb)aa(cc)aa(dd)qq', 'aabbaaccaaddqq',
-	array( 'bb', 'cc', 'dd' ),
+	array('bb', 'cc', 'dd'),
 	'return array of captures'
-	);
+);
+
 match(
 	'\bblah\b', ' blah ',
 	'blah',
 	'match \b at spaces'
-	);
+);
+
 match(
 	'\bblah\b', 'blah',
 	'blah',
 	'match \b at end of string'
-	);
+);
+
 match(
 	'\bblah\b', 'ablahc',
 	false,
 	'don\'t match \b at word chars'
-	);
+);
+
 match(
-	'\bblah\b', '�blah�',
+	'\bblah\b', 'Üblahä',
 	false,
-	'don\'t match \b at umlaute in latin1'
-	);
+	'don\'t match \b at umlaute'
+);
+
 match(
 	'\Bblah\B', ' blah ',
 	false,
 	'don\'t match \B at spaces'
-	);
+);
+
 match(
 	'\Bblah\B', 'blah',
 	false,
 	'don\'t match \B at end of string'
-	);
+);
+
 match(
 	'\Bblah\B', 'ablahc',
 	'blah',
 	'match \B at word chars'
-	);
+);
+
 match(
-	'\Bblah\B', '�blah�',
+	'\Bblah\B', 'Üblahä',
 	'blah',
-	'match \B at umlaute in latin1'
-	);
+	'match \B at umlaute'
+);
+
 match(
-	'\w+', '  |#�blah�   ',
-	'�blah�',
+	'\w+', '  |#Üblahä   ',
+	'Üblahä',
 	'include umlaute in \w'
-	);
+);
+
 match(
-	'[[:alpha:]]+', '  |#blah�   ',
-	'blah�',
+	'[[:alpha:]]+', '  |#blahä   ',
+	'blahä',
 	'include umlaute in [[:alpha:]]'
-	);
+);
+
 match(
-	'\W+', '  |#�blah�  ',
+	'\W+', '  |#Üblahä  ',
 	'  |#',
 	'don\'t include umlaute in \W'
-	);
+);
+
 match(
-	'\ba', '�a',
+	'\ba', 'äa',
 	'',
 	'\b must know umlauts'
-	);
+);
 
-eval( '$escapedwordregex = "' . it::convertregex( '\w' ) . '";' );
-$escapedwordregex = preg_replace( '|[\\\\/]|', '', $escapedwordregex );
+match(
+	'aaa\\\\w+', '   aaa\www  ',
+	'aaa\www',
+	'don\'t parse \w in \\\\w at beginning (match)'
+);
+
+match(
+	'aaa\\\\w+', '   aaa\www  ',
+	'aaa\www',
+	'don\'t parse \w in \\\\w after chars (match)'
+);
+
+eval('$escapedwordregex = "' . it::convertregex('\w') . '";');
+$escapedwordregex = preg_replace('|[\\\\/]|', '', $escapedwordregex);
 
 match(
 	'\\\\w+',  $escapedwordregex,
 	false,
 	'don\'t parse \w in \\\\w at beginning (no match)'
-	);
-match(
-	'aaa\\\\w+', '   aaa\www  ',
-	'aaa\www',
-	'don\'t parse \w in \\\\w at beginning (match)'
-	);
+);
+
 match(
 	'aaa\\\\w+', 'aaa' . $escapedwordregex,
 	false,
 	'don\'t parse \w in \\\\w after chars (no match)'
-	);
-match(
-	'aaa\\\\w+', '   aaa\www  ',
-	'aaa\www',
-	'don\'t parse \w in \\\\w after chars (match)'
-	);
+);
+
 match(
 	'\\\\\\\\w+', '\\' . $escapedwordregex,
 	false,
 	'don\'t parse \w in \\\\\\\w (no match)'
-	);
+);
+
 match(
 	'\\\\\\\\w+', '  \\\\www  ',
 	'\\\\www',
 	'don\'t parse \\\\\\\\w as \w (match)'
-	);
+);
+
 match(
 	'[\w]+', '[[[]]]---',
 	false,
 	'replace \w in [\w] correctly (no match)'
-	);
+);
+
 match(
 	'[\w]+', '  \\\\aword[[[]]]   ',
 	'aword',
 	'replace \w in [\w] correctly (match)'
-	);
+);
+
 match(
 	'[\\\\w]+', ' blabergna ',
 	false,
 	'don\'t parse \w in [\\\\w] (no match)'
-	);
+);
+
 match(
 	'[\\\\w]+', '  \\\\worda[[[]',
 	'\\\\w',
 	'don\'t parse \w in [\\\\w] (match)'
-	);
+);
+
 match(
 	'[a\W]+', 'bbbbbbb a a%$+ accccc',
 	' a a%$+ a',
 	'\W in []'
-	);
+);
+
 match(
-	'\\\\\\w+', '  \�blah�  ',
-	'\�blah�',
+	'\\\\\\w+', '  \Üblahä  ',
+	'\Üblahä',
 	'parse \w in \\\\\\w at beginning'
-	);
+);
+
 match(
-	'aaa\\\\\\w+', '  aaa\�blah�  ',
-	'aaa\�blah�',
+	'aaa\\\\\\w+', '  aaa\Üblahä  ',
+	'aaa\Üblahä',
 	'parse \w in \\\\\\w after chars'
-	);
-is(
-	it::replace(
-		array(
-			'regex1' => 'repl1',
-			'regex2' => 'repl2',
-			'regex3' => 'repl3' ),
-		'regex2 regex1 regex3' ),
-	'repl2 repl1 repl3',
-	'test tr regex function'
-	);
-is(
-	it::match( '\w+', 'word1 w�rd2 word_3', array('all' => true )),
-	array( 'word1', 'w�rd2', 'word_3' ),
-	"test match_all function"
-	);
+);
+
+match(
+	'\w+', 'word1 wörd2 word_3',
+	array('word1', 'wörd2', 'word_3'),
+	"test match_all function",
+	array('all' => true)
+);
+
 match(
 	'aBcD', '  aBcD  ',
 	'aBcD',
 	"caseinsensitive is default"
-	);
+);
+
 match(
-	'�', '�',
-	'�',
-	'match umlaute in latin1 case insensitive'
-	);
+	'\w+', 'Müller',
+	'Müller',
+	'\w matches umlaut in utf-8 mode'
+);
 
-is(
-	it::match(utf8_encode('a�B�'), utf8_encode("A�b�"), array('utf8' => true)),
-	utf8_encode('A�b�'),
-	"match utf-8 umlaute in case insensitive"
+match(
+	'M.ller', 'Müller',
+	'Müller',
+	'. matches umlaut in utf-8 mode'
 );
 
-$oldcharset = ini_get('default_charset');
-ini_set('default_charset', 'utf-8');
 match(
-	utf8_encode('a�B�'), utf8_encode('A�b�'),
-	utf8_encode('A�b�'),
-	"match utf-8 umlaute in case insensitive using default_charset"
+	utf8_decode('ö'), utf8_decode('Ö'),
+	utf8_decode('Ö'),
+	'match umlaute in de_CH.latin1 case insensitive',
+	array('utf8' => false)
 );
-is(
-	it::match('a�B�', 'A�b�', array('utf8' => false)),
-	'A�b�',
-	"non-utf-8 override with default_charset=utf-8"
+
+match(
+	utf8_decode('aöBÜ'), utf8_decode('AÖbü'),
+	utf8_decode('AÖbü'),
+	"match umlaute with non-utf-8 override in p",
+	array('utf8' => false)
 );
+
+
 match(
-	'\w+', utf8_encode('M�ller'),
-	utf8_encode('M�ller'),
-	'\w matches umlaut in utf-8 mode'
+	'abc', "aBc",
+	false,
+	"set case sensitivity by parameter",
+	array('casesensitive' => 1)
 );
+
 match(
-	'M.ller', utf8_encode('M�ller'),
-	utf8_encode('M�ller'),
-	'. matches umlaut in utf-8 mode'
+	'\w+', 'word1 wörd2 word_3',
+	array('word1', 'wörd2', 'word_3'),
+	"test all => 1 without captures",
+	array('all' => 1)
 );
-ini_set('default_charset', $oldcharset);
 
-is(
-	it::match( 'abc', "aBc", array('casesensitive' => 1 )),
-	false,
-	"set case sensitivity by parameter"
-	);
+match(
+	'\w+\s+(\d+)', 'word1 12 wörd2 3 word_3 4',
+	array('12', '3', '4'),
+	"test all => 1 with one capture",
+	array('all' => 1)
+);
 
+match(
+	'(\w+)\s+(\d+)', 'word1 12 wörd2 3 word_3 4',
+	array(array('word1', '12'), array('wörd2', '3'), array('word_3', '4')),
+	"test all => 1 with captures",
+	array('all' => 1)
+);
+
+match(
+	'(\w+)\s+(\d+)', 'word1 12 wörd2 3 word_3 4',
+	array(array('word1', 'wörd2', 'word_3'), array('12', '3', '4')),
+	"test all => 1,pattern_order => 1",
+	array('all' => 1, 'pattern_order' => 1)
+);
+
+ini_set('default_charset', 'iso-8859-1');
+match(
+	'aöBÜ', "AÖbü",
+	'AÖbü',
+	"match utf-8 umlaute in case insensitive mode with utf8 override",
+	array('utf8' => true)
+);
+ini_set('default_charset', 'utf-8');
+
+
+#
+# tests for it::replace()
+#
 is(
-	it::match( '\w+', 'word1 w�rd2 word_3', array('all' => 1 )),
-	array( 'word1', 'w�rd2', 'word_3' ),
-	"test all=>1 without captures"
-	);
-is(
-	it::match( '\w+\s+(\d+)', 'word1 12 w�rd2 3 word_3 4', array('all' => 1 )),
-	array( '12', '3', '4' ),
-	"test all=>1 with one capture"
-	);
-is(
-	it::match( '(\w+)\s+(\d+)', 'word1 12 w�rd2 3 word_3 4', array('all' => 1 )),
-	array( array( 'word1', '12' ), array( 'w�rd2', '3' ), array( 'word_3', '4' ) ),
-	"test all=>1 with captures"
-	);
-is(
-	it::match( '(\w+)\s+(\d+)', 'word1 12 w�rd2 3 word_3 4', array('all' => 1, 'pattern_order' => 1 )),
-	array( array( 'word1', 'w�rd2', 'word_3' ), array( '12', '3', '4' ) ),
-	"test all=>1,pattern_order=>1"
-	);
+	it::replace(
+		array(
+			'regex1' => 'repl1',
+			'regex2' => 'repl2',
+			'regex3' => 'repl3'),
+		'regex2 regex1 regex3'),
+	'repl2 repl1 repl3',
+	'test tr regex function'
+);
 
 is(it::replace(array('a' => "1", 'b' => "2"), "ab"), "12");
 is(it::replace(array('!' => "x"), "!"), "x");
-is(it::replace(array('\w' => "x"), "o�"), "xx");
-is(it::replace(array('[[:alpha:]]' => "x"), "�"), "x");
-is(it::replace(array('\w' => "x", '#' => "!"), "#�"), "!x");
-is(it::replace(array('#' => "!", '\w' => "x"), "#�"), "!x");
-is(it::replace(array('�' => "x"), "�"), "x");
+is(it::replace(array('\w' => "x"), "oö"), "xx");
+is(it::replace(array('[[:alpha:]]' => "x"), "ö"), "x");
+is(it::replace(array('\w' => "x", '#' => "!"), "#ö"), "!x");
+is(it::replace(array('#' => "!", '\w' => "x"), "#ö"), "!x");
+is(it::replace(array('ö' => "x"), "Ö"), "x");
 is(it::replace(array('a' => "1"), "aaa", array('limit' => 1)), "1aa");
+is(it::replace(array('\s' => "x"), it_html::entity_decode("&nbsp;")), "x", "match non-breaking space as white-space character");
+
+is(it::grep('ismatch', array('ismatch', 'isnomatch')), array('ismatch'),  'grep with simple regex');
+is(it::grep('!', array('ismatch!', 'isnomatch')),      array('ismatch!'), '! in regex');
+is(it::grep('lower|UPPER', array('lower', 'LOWER', 'upper', 'UPPER'), array('casesensitive' => 1)), array(0 => 'lower', 3 => 'UPPER'), 'set casesensitive');
+is(it::grep('match', array('foo' => 'match', 'bar' => 'gna')), array('foo' => 'match'), 'with keys');
+
+setlocale(LC_CTYPE, $oldlocale);
+ini_set('default_charset', $oldcharset);	# end of tests that must run with specific charset
+
 
 # it::filter_keys tests
 
@@ -282,4 +347,26 @@ is(it::date('datetime', 1000000.543), it::date('datetime', "1000000"), '... larg
 is(it::date('time', "10.5"), "10:05", 'interpret string with points with strtotime');
 is(it::date('time', "10.05"), "10:05", 'interpret string with points with strtotime');
 
+# it::uc*
+is(it::ucfirst('foo bär über'), 'Foo bär über');
+is(it::ucwords('foo bär über'), 'Foo Bär Über');
+
+# it::substr_replace
+is(it::substr_replace('abcdefgh', 'xyz', 2, 4), substr_replace('abcdefgh', 'xyz', 2, 4), 'it::substr_replace the same as substr_replace for ascii');
+is(it::substr_replace('✔☯♥', '☃☃', 1, 1), '✔☃☃♥', 'it::substr_replace for utf-8');
+
+is(it::any2utf8('Meier'), 'Meier', "it::any2utf8 ascii input");
+is(it::any2utf8('Müller'), 'Müller', "it::any2utf8 utf8 input");
+is(it::any2utf8('Aslı'), 'Aslı', "it::any2utf8 utf8 non-latin1 input");
+is(it::any2utf8(utf8_decode('Müller')), 'Müller', "it::any2utf8 latin1 input");
+
+is(it::any2utf8(
+	' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'),
+	' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ',
+	"it::any2utf8 utf8 input (exhaustive alphabet)"); 
+is(it::any2utf8(
+	utf8_decode(' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ')),
+	' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ',
+	"it::any2utf8 latin1 input (exhaustive alphabet)"); 
+
 ?>
diff --git a/tests/it_html.t b/tests/it_html.t
index 0def431..174c487 100755
--- a/tests/it_html.t
+++ b/tests/it_html.t
@@ -4,6 +4,7 @@