KDECore
nsUniversalDetector.cpp
Go to the documentation of this file.00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 00002 /* -*- C++ -*- 00003 * Copyright (C) 1998 <developer@mozilla.org> 00004 * Copyright (C) 2008 <wkai@gmail.com> 00005 * 00006 * Permission is hereby granted, free of charge, to any person obtaining 00007 * a copy of this software and associated documentation files (the 00008 * "Software"), to deal in the Software without restriction, including 00009 * without limitation the rights to use, copy, modify, merge, publish, 00010 * distribute, sublicense, and/or sell copies of the Software, and to 00011 * permit persons to whom the Software is furnished to do so, subject to 00012 * the following conditions: 00013 * 00014 * The above copyright notice and this permission notice shall be included 00015 * in all copies or substantial portions of the Software. 00016 * 00017 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 00018 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 00019 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 00020 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 00021 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 00022 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 00023 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 00024 */ 00025 00026 #include "nsUniversalDetector.h" 00027 00028 #include "nsMBCSGroupProber.h" 00029 #include "nsSBCSGroupProber.h" 00030 #include "nsEscCharsetProber.h" 00031 #include "nsLatin1Prober.h" 00032 00033 namespace kencodingprober { 00034 nsUniversalDetector::nsUniversalDetector() 00035 { 00036 mDone = false; 00037 mBestGuess = -1; //illegal value as signal 00038 mInTag = false; 00039 mEscCharSetProber = 0; 00040 00041 mStart = true; 00042 mDetectedCharset = 0; 00043 mGotData = false; 00044 mInputState = ePureAscii; 00045 mLastChar = '\0'; 00046 00047 unsigned int i; 00048 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00049 mCharSetProbers[i] = 0; 00050 } 00051 00052 nsUniversalDetector::~nsUniversalDetector() 00053 { 00054 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00055 if (mCharSetProbers[i]) 00056 delete mCharSetProbers[i]; 00057 if (mEscCharSetProber) 00058 delete mEscCharSetProber; 00059 } 00060 00061 void 00062 nsUniversalDetector::Reset() 00063 { 00064 mDone = false; 00065 mBestGuess = -1; //illegal value as signal 00066 mInTag = false; 00067 00068 mStart = true; 00069 mDetectedCharset = 0; 00070 mGotData = false; 00071 mInputState = ePureAscii; 00072 mLastChar = '\0'; 00073 00074 if (mEscCharSetProber) 00075 mEscCharSetProber->Reset(); 00076 00077 unsigned int i; 00078 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00079 if (mCharSetProbers[i]) 00080 mCharSetProbers[i]->Reset(); 00081 } 00082 00083 //--------------------------------------------------------------------- 00084 #define SHORTCUT_THRESHOLD (float)0.95 00085 #define MINIMUM_THRESHOLD (float)0.20 00086 00087 nsProbingState nsUniversalDetector::HandleData(const char* aBuf, unsigned int aLen) 00088 { 00089 if(mDone) 00090 return eFoundIt; 00091 00092 if (aLen > 0) 00093 mGotData = true; 00094 00095 unsigned int i; 00096 for (i = 0; i < aLen; i++) 00097 { 00098 //other than 0xa0, if every othe character is ascii, the page is ascii 00099 if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP 00100 { 00101 //we got a non-ascii byte (high-byte) 00102 if (mInputState != eHighbyte) 00103 { 00104 //adjust state 00105 mInputState = eHighbyte; 00106 00107 //kill mEscCharSetProber if it is active 00108 if (mEscCharSetProber) { 00109 delete mEscCharSetProber; 00110 mEscCharSetProber = 0; 00111 } 00112 00113 //start multibyte and singlebyte charset prober 00114 if (0 == mCharSetProbers[0]) 00115 mCharSetProbers[0] = new nsMBCSGroupProber; 00116 if (0 == mCharSetProbers[1]) 00117 mCharSetProbers[1] = new nsSBCSGroupProber; 00118 if (0 == mCharSetProbers[2]) 00119 mCharSetProbers[2] = new nsLatin1Prober; 00120 } 00121 } 00122 else 00123 { 00124 //ok, just pure ascii so far 00125 if ( ePureAscii == mInputState && 00126 (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) 00127 { 00128 //found escape character or HZ "~{" 00129 mInputState = eEscAscii; 00130 } 00131 00132 mLastChar = aBuf[i]; 00133 } 00134 } 00135 00136 nsProbingState st = eDetecting; 00137 switch (mInputState) 00138 { 00139 case eEscAscii: 00140 if (0 == mEscCharSetProber) { 00141 mEscCharSetProber = new nsEscCharSetProber; 00142 } 00143 st = mEscCharSetProber->HandleData(aBuf, aLen); 00144 if (st == eFoundIt) 00145 { 00146 mDone = true; 00147 mDetectedCharset = mEscCharSetProber->GetCharSetName(); 00148 } 00149 break; 00150 case eHighbyte: 00151 for (i = 0; i < NUM_OF_CHARSET_PROBERS; ++i) 00152 { 00153 st = mCharSetProbers[i]->HandleData(aBuf, aLen); 00154 if (st == eFoundIt) 00155 { 00156 mDone = true; 00157 mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); 00158 } 00159 } 00160 break; 00161 00162 default: //pure ascii 00163 mDetectedCharset = "UTF-8"; 00164 } 00165 return st; 00166 } 00167 00168 00169 //--------------------------------------------------------------------- 00170 const char* nsUniversalDetector::GetCharSetName() 00171 { 00172 if (mDetectedCharset) 00173 return mDetectedCharset; 00174 switch (mInputState) 00175 { 00176 case eHighbyte: 00177 { 00178 float proberConfidence; 00179 float maxProberConfidence = (float)0.0; 00180 int maxProber = 0; 00181 00182 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00183 { 00184 proberConfidence = mCharSetProbers[i]->GetConfidence(); 00185 if (proberConfidence > maxProberConfidence) 00186 { 00187 maxProberConfidence = proberConfidence; 00188 maxProber = i; 00189 } 00190 } 00191 //do not report anything because we are not confident of it, that's in fact a negative answer 00192 if (maxProberConfidence > MINIMUM_THRESHOLD) 00193 return mCharSetProbers[maxProber]->GetCharSetName(); 00194 } 00195 case eEscAscii: 00196 break; 00197 default: // pure ascii 00198 ; 00199 } 00200 return "UTF-8"; 00201 00202 } 00203 00204 //--------------------------------------------------------------------- 00205 float nsUniversalDetector::GetConfidence() 00206 { 00207 if (!mGotData) 00208 { 00209 // we haven't got any data yet, return immediately 00210 // caller program sometimes call DataEnd before anything has been sent to detector 00211 return MINIMUM_THRESHOLD; 00212 } 00213 if (mDetectedCharset) 00214 return 0.99f; 00215 switch (mInputState) 00216 { 00217 case eHighbyte: 00218 { 00219 float proberConfidence; 00220 float maxProberConfidence = (float)0.0; 00221 int maxProber = 0; 00222 00223 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00224 { 00225 proberConfidence = mCharSetProbers[i]->GetConfidence(); 00226 if (proberConfidence > maxProberConfidence) 00227 { 00228 maxProberConfidence = proberConfidence; 00229 maxProber = i; 00230 } 00231 } 00232 //do not report anything because we are not confident of it, that's in fact a negative answer 00233 if (maxProberConfidence > MINIMUM_THRESHOLD) 00234 return mCharSetProbers[maxProber]->GetConfidence(); 00235 } 00236 case eEscAscii: 00237 break; 00238 default: // pure ascii 00239 ; 00240 } 00241 return MINIMUM_THRESHOLD; 00242 } 00243 00244 nsProbingState nsUniversalDetector::GetState() 00245 { 00246 if (mDone) 00247 return eFoundIt; 00248 else 00249 return eDetecting; 00250 } 00251 } 00252 00253