it::convertregex() sets /u if default_charset is 'utf-8' (can be overridden)

author: Christian Weber 2012-02-29 10:12:11 +0000
committer: Christian Weber 2012-02-29 10:12:11 +0000
commit: c0938f79a452c44e28fccc326775891ba6c743e3 (patch)
tree: 98db80457a9bb72f2c80621d3c36cbaa020b279a
parent: 4d945c4a8ced3835064592f35ce2d68083e51b09 (diff)
download: itools-c0938f79a452c44e28fccc326775891ba6c743e3.tar.gz
itools-c0938f79a452c44e28fccc326775891ba6c743e3.tar.bz2
itools-c0938f79a452c44e28fccc326775891ba6c743e3.zip
2 files changed, 39 insertions, 24 deletions
diff --git a/it.class b/it.class
index 818c9e0..292c154 100644
--- a/it.class
+++ b/it.class
@@ -310,35 +310,24 @@ static function toascii($text)
  * @param $p['casesensitive'] Regex is case sensitive (omit modifier i)
  * @param $p['multiline'] add modifier m: ^ and $ match \n
  * @param $p['singleline'] add modifier s: . matches \n
- * @param $p['utf8'] add modifier u
+ * @param $p['utf8'] add modifier u. This is the default if default_charset is utf-8, override with $p['utf8'] = false
  * @param $p['extended'] add modifier x (non signifcant whitespace)
  * @return converted regex to use with preg
  */
 static function convertregex($pattern, $p = array())
 {
-	$pattern = preg_replace('|/|', '\/', $pattern); 
-	$modifiers = '';
-
-	if (!$p['casesensitive'])
-		$modifiers .= 'i';
-
 	if ($p['exec'])
 		it::fatal("Option exec to it::replace has been removed for security reasons");
 
-	foreach (array(
-			'multiline'  => 'm',
-			'singleline' => 's',
-			'utf8'       => 'u',
-			'extended'   => 'x',
-		) as $key => $mod)
-	{
-		if ($p[$key])
-			$modifiers .= $mod;
-	}
-
-	return  "/$pattern/$modifiers";
+	return '/' . strtr($pattern, array('/' => '\/')) . '/' .
+		(!$p['casesensitive'] ? 'i' : '') .
+		($p['multiline'] ? 'm' : '') .
+		($p['singleline'] ? 's' : '') .
+		($p['extended'] ? 'x' : '') .
+		((!isset($p['utf8']) && ini_get('default_charset') == 'utf-8' || $p['utf8']) ? 'u' : '');
 }
 
+
 /**
  * Try to match string against regex. Case insensitive by default.
  * @param $pattern Regex to match against
diff --git a/tests/it.t b/tests/it.t
index 67160d0..c0efa2c 100755
--- a/tests/it.t
+++ b/tests/it.t
@@ -3,13 +3,13 @@
 
 # Tests for it.class
 
-function match( $regex, $string, $exp, $name )
+function match($regex, $string, $expect, $name)
 {
 	$GLOBALS['TEST_MORE_LEVEL'] = 1;
-	$pass = is( it::match( $regex, $string ), $exp, $name );
-	if( !$pass ) {
-		diag( "        regex given: $regex" );
-		diag( "    regex converted: " . it::convertregex( $regex ) );
+	$pass = is (it::match($regex, $string), $expect, $name);
+	if (!$pass) {
+		diag("        regex given: $regex");
+		diag("    regex converted: " . it::convertregex($regex));
 	} 
 	$GLOBALS['TEST_MORE_LEVEL'] = 0;
 }
@@ -183,6 +183,32 @@ match(
 	'Ö',
 	'match umlaute in latin1 case insensitive'
 	);
+
+is(
+	it::match(utf8_encode('aöBÜ'), utf8_encode("AÖbü"), array('utf8' => true)),
+	utf8_encode('AÖbü'),
+	"match utf-8 umlaute in case insensitive"
+);
+
+$oldcharset = ini_get('default_charset');
+ini_set('default_charset', 'utf-8');
+match(
+	utf8_encode('aöBÜ'), utf8_encode('AÖbü'),
+	utf8_encode('AÖbü'),
+	"match utf-8 umlaute in case insensitive using default_charset"
+);
+is(
+	it::match('aöBÜ', 'AÖbü', array('utf8' => false)),
+	'AÖbü',
+	"non-utf-8 override with default_charset=utf-8"
+);
+match(
+	'\w+', utf8_encode('Müller'),
+	utf8_encode('Müller'),
+	'\w matches umlaut in utf-8 mode'
+);
+ini_set('default_charset', $oldcharset);
+
 is(
 	it::match( 'abc', "aBc", array('casesensitive' => 1 )),
 	false,
author	Christian Weber	2012-02-29 10:12:11 +0000
committer	Christian Weber	2012-02-29 10:12:11 +0000
commit	c0938f79a452c44e28fccc326775891ba6c743e3 (patch)
tree	98db80457a9bb72f2c80621d3c36cbaa020b279a
parent	4d945c4a8ced3835064592f35ce2d68083e51b09 (diff)
download	itools-c0938f79a452c44e28fccc326775891ba6c743e3.tar.gz itools-c0938f79a452c44e28fccc326775891ba6c743e3.tar.bz2 itools-c0938f79a452c44e28fccc326775891ba6c743e3.zip