• 28th, May 2011

PHP Clean String of UTF8 Chars – Convert to similar ASCII char

	/**
	* Returns a copy of $text reduced to printable ASCII (0x20-0x7E).
	*
	* Accented Latin letters, typographic quotes, the en dash and the non-breaking
	* space are first replaced by a similar ASCII character; HTML entities are
	* decoded, HTML tags are removed, line breaks become a single space, tabs are
	* dropped, and every remaining non-ASCII or control byte is deleted.
	* The euro sign (literal or as &euro;) is spelled out as "euro".
	*
	* www.unexpectedit.com
	*
	* @param string $text Text to clean. Expected to be UTF-8; input that is not
	*                     valid UTF-8 is interpreted as ISO-8859-1.
	* @return string The cleaned, ASCII-only string.
	*/
function cleanString($text) {
	$text = (string) $text;
	if ($text === '') {
		return '';
	}

	// 0) The /u patterns below return NULL on malformed UTF-8, which used to turn
	//    the whole result into an empty string. Fall back to reading the input as
	//    ISO-8859-1 and re-encode every high byte as a two-byte UTF-8 sequence.
	if (preg_match('//u', $text) !== 1) {
		$text = preg_replace_callback('/[\x80-\xFF]/', function ($match) {
			$byte = ord($match[0]);
			return chr(0xC0 | ($byte >> 6)) . chr(0x80 | ($byte & 0x3F));
		}, $text);
	}

	// 1) convert á ô => a o (character class => ASCII replacement)
	$similar = array(
		'/[áàâãªä]/u' => 'a',
		'/[ÁÀÂÃÄ]/u'  => 'A',
		'/[ÍÌÎÏ]/u'   => 'I',
		'/[íìîï]/u'   => 'i',
		'/[éèêë]/u'   => 'e',
		'/[ÉÈÊË]/u'   => 'E',
		'/[óòôõºö]/u' => 'o',
		'/[ÓÒÔÕÖ]/u'  => 'O',
		'/[úùûü]/u'   => 'u',
		'/[ÚÙÛÜ]/u'   => 'U',
		'/[’‘‹›‚]/u'  => "'",
		'/[“”«»„]/u'  => '"',
	);
	$text = preg_replace(array_keys($similar), array_values($similar), $text);

	// The non-breaking space is written as an escape so it cannot be silently
	// turned into a normal space by an editor or a copy/paste.
	$text = str_replace(
		array('–', "\xC2\xA0", 'ç', 'Ç', 'ñ', 'Ñ'),
		array('-', ' ', 'c', 'C', 'n', 'N'),
		$text
	);

	// 2) Spell out the euro sign before anything else can decode/strip it.
	$text = str_replace(array('&euro;', '€'), 'euro', $text);

	//    Turn named entities back into characters BEFORE strip_tags(), so that
	//    encoded markup such as &lt;p&gt; is removed as well. ksort() keeps the
	//    replacement order deterministic ('&amp;' is handled before '&lt;').
	//    Flags and charset are explicit so the result does not depend on the
	//    PHP version's defaults.
	$entities = get_html_translation_table(HTML_ENTITIES, ENT_QUOTES, 'UTF-8');
	ksort($entities);
	$text = str_replace(array_values($entities), array_keys($entities), $text);

	// 3) remove <p>, <br/> ...
	$text = strip_tags($text);

	// 4) &amp; => & &quot; => " (also covers numeric entities such as &#160;)
	$text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

	// 5) line breaks => space, tab => nothing. This must run before the final
	//    strip, and needs double quotes so the escapes are real control chars.
	$text = str_replace(
		array("\r\n", "\n", "\r", "\t"),
		array(' ', ' ', ' ', ''),
		$text
	);

	// 6) remove everything that is not printable ASCII (symbols like
	//    "TradeMark", leftover control characters, DEL).
	$text = preg_replace('/[^\x20-\x7E]+/', '', $text);

	return $text;
}

Usage:

	// Sample input. The non-breaking space (\xC2\xA0) and the tab (\t) are written
	// as escape sequences so they survive copy/paste.
	$val = "Arômes ... óòôõº ... áéíóú ... Barça ... “Windows quotes” ... this is not a normal space (\xC2\xA0) ... this is not a normal dash (–) ... Esdrújula ... Wünderlist ... &#160; ... &amp; ... & ... &rsquo; ... &ndash; ... &pound; ... &euro; ... &nbsp; ... \t...";
	echo cleanString($val);
	//result: Aromes ... ooooo ... aeiou ... Barca ... "Windows quotes" ... this is not a normal space ( ) ... this is not a normal dash (-) ... Esdrujula ... Wunderlist ... ... & ... & ... ... ... ... euro ... ... ...

	//Note: If you get an empty string, the input was not valid UTF-8. For a string
	//that is ISO-8859-1 encoded, convert it to UTF-8 before calling the function
	//(utf8_encode() is deprecated as of PHP 8.2, so mb_convert_encoding() is used):
	echo cleanString(mb_convert_encoding($val, 'UTF-8', 'ISO-8859-1'));

Tags: , , , , , , ,

Leave a Reply

*

© 2010 unexpected[it]. All Rights Reserved.