http://pastebin.com/18iNY6dU
- Code: Tout sélectionner
/**
 * Tell whether a byte string is well-formed UTF-8 (RFC 3629).
 *
 * Every byte is examined. Pure 7-bit ASCII and the empty string are valid.
 * A sequence is rejected when:
 *   - a continuation byte (0x80-0xBF) appears where a lead byte is expected,
 *   - the lead byte is 0xC0/0xC1 (overlong 2-byte form) or 0xF5-0xFF
 *     (beyond U+10FFFF, including the obsolete 5- and 6-byte forms),
 *   - the string ends before the sequence is complete,
 *   - a continuation byte is outside 0x80-0xBF,
 *   - the first continuation byte makes the sequence overlong (E0 80-9F,
 *     F0 80-8F), a UTF-16 surrogate (ED A0-BF) or above U+10FFFF (F4 90-BF).
 *
 * @param string $str Raw bytes to check.
 * @return bool True if $str is well-formed UTF-8, false otherwise.
 */
function is_utf8($str) {
    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);
        if ($c < 0x80) {
            continue; // plain ASCII byte
        }

        // Allowed range for the FIRST continuation byte; some lead bytes
        // narrow it, all later continuation bytes use the full 0x80-0xBF.
        $lo = 0x80;
        $hi = 0xBF;
        if ($c >= 0xC2 && $c <= 0xDF) {
            $octets = 2;
        } elseif ($c >= 0xE0 && $c <= 0xEF) {
            $octets = 3;
            if ($c == 0xE0) {
                $lo = 0xA0; // below this the 3-byte form is overlong
            } elseif ($c == 0xED) {
                $hi = 0x9F; // above this lies the surrogate range U+D800-U+DFFF
            }
        } elseif ($c >= 0xF0 && $c <= 0xF4) {
            $octets = 4;
            if ($c == 0xF0) {
                $lo = 0x90; // below this the 4-byte form is overlong
            } elseif ($c == 0xF4) {
                $hi = 0x8F; // above this the code point exceeds U+10FFFF
            }
        } else {
            // 0x80-0xBF (stray continuation), 0xC0/0xC1, or 0xF5-0xFF
            return false;
        }

        // The whole sequence must fit in the remaining bytes.
        if ($i + $octets > $len) {
            return false;
        }

        for ($k = 1; $k < $octets; $k++) {
            $b = ord($str[$i + $k]);
            if ($b < $lo || $b > $hi) {
                return false;
            }
            $lo = 0x80;
            $hi = 0xBF;
        }

        // Skip the continuation bytes; the for-loop's $i++ moves past the last one.
        $i += $octets - 1;
    }
    return true;
}
Code for C and C++:
- Code: Tout sélectionner
// ****************************************************************************
//O is_utf8 ()
// ****************************************************************************
// Tell whether a NUL-terminated byte string is well-formed UTF-8 (RFC 3629).
//
// Parameter:
//   string  NUL-terminated buffer to check; it is only read.
//
// Returns 1 when every byte belongs to a well-formed UTF-8 sequence (pure
// ASCII and the empty string qualify), 0 otherwise or when string is null.
//
// Accepted sequences:
//   00..7F                          one byte (ASCII)
//   C2..DF  80..BF                  two bytes
//   E0..EF  80..BF 80..BF           three bytes (E0: 2nd byte A0..BF,
//                                                ED: 2nd byte 80..9F)
//   F0..F4  80..BF 80..BF 80..BF    four bytes  (F0: 2nd byte 90..BF,
//                                                F4: 2nd byte 80..8F)
// Everything else is rejected: stray continuation bytes (80..BF), the overlong
// lead bytes C0/C1, and F5..FF (beyond U+10FFFF, including the obsolete 5- and
// 6-byte forms).
//
// Bytes are examined as unsigned char, so the result does not depend on
// whether plain char is signed or on the width of int. The function writes
// nothing to stdout; callers report the result themselves.
int Mkdcppw::is_utf8(char string[])
{
    if (!string)
    {
        return 0;
    }

    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(string);
    const size_t len = strlen(string);
    size_t pos = 0;

    while (pos < len)
    {
        const unsigned char lead = bytes[pos];

        if (lead < 0x80)
        {
            pos++;                      // plain ASCII byte
            continue;
        }

        // Allowed range for the FIRST continuation byte; some lead bytes
        // narrow it to exclude overlong forms, surrogates and > U+10FFFF.
        size_t noctets;
        unsigned char lo = 0x80;
        unsigned char hi = 0xBF;

        if (lead >= 0xC2 && lead <= 0xDF)
        {
            noctets = 2;
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            noctets = 3;
            if (lead == 0xE0)
            {
                lo = 0xA0;              // below this the 3-byte form is overlong
            }
            else if (lead == 0xED)
            {
                hi = 0x9F;              // above this: surrogates U+D800..U+DFFF
            }
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            noctets = 4;
            if (lead == 0xF0)
            {
                lo = 0x90;              // below this the 4-byte form is overlong
            }
            else if (lead == 0xF4)
            {
                hi = 0x8F;              // above this: code point > U+10FFFF
            }
        }
        else
        {
            // 80..BF (stray continuation), C0/C1, or F5..FF: probably an
            // 8-bit ISO encoding, in any case not UTF-8.
            return 0;
        }

        // The whole sequence must fit before the terminating NUL.
        if (noctets > len - pos)
        {
            return 0;
        }

        if (bytes[pos + 1] < lo || bytes[pos + 1] > hi)
        {
            return 0;
        }
        for (size_t k = 2; k < noctets; k++)
        {
            if (bytes[pos + k] < 0x80 || bytes[pos + k] > 0xBF)
            {
                return 0;
            }
        }

        pos += noctets;
    }

    return 1;
}
Extract from the utf-8(7) man page:
- Code: Tout sélectionner
The UTF-8 encoding has the following nice properties:
* UCS characters 0x00000000 to 0x0000007f (the classic US-ASCII charac‐
ters) are encoded simply as bytes 0x00 to 0x7f (ASCII compatibility).
This means that files and strings which contain only 7-bit ASCII
characters have the same encoding under both ASCII and UTF-8.
Consult the Unix utf-8(7) man page.
See also : http://fr.wikipedia.org/wiki/Unicode#D% ... techniques and : http://en.wikipedia.org/wiki/Unicode