KDECore
nsHebrewProber.cpp
Go to the documentation of this file.00001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 00002 /* -*- C++ -*- 00003 * Copyright (C) 1998 <developer@mozilla.org> 00004 * 00005 * 00006 * Permission is hereby granted, free of charge, to any person obtaining 00007 * a copy of this software and associated documentation files (the 00008 * "Software"), to deal in the Software without restriction, including 00009 * without limitation the rights to use, copy, modify, merge, publish, 00010 * distribute, sublicense, and/or sell copies of the Software, and to 00011 * permit persons to whom the Software is furnished to do so, subject to 00012 * the following conditions: 00013 * 00014 * The above copyright notice and this permission notice shall be included 00015 * in all copies or substantial portions of the Software. 00016 * 00017 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 00018 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 00019 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 00020 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 00021 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 00022 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 00023 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 00024 */ 00025 00026 #include "nsHebrewProber.h" 00027 #include <stdio.h> 00028 00029 // windows-1255 / ISO-8859-8 code points of interest 00030 #define FINAL_KAF ('\xea') 00031 #define NORMAL_KAF ('\xeb') 00032 #define FINAL_MEM ('\xed') 00033 #define NORMAL_MEM ('\xee') 00034 #define FINAL_NUN ('\xef') 00035 #define NORMAL_NUN ('\xf0') 00036 #define FINAL_PE ('\xf3') 00037 #define NORMAL_PE ('\xf4') 00038 #define FINAL_TSADI ('\xf5') 00039 #define NORMAL_TSADI ('\xf6') 00040 00041 // Minimum Visual vs Logical final letter score difference. 00042 // If the difference is below this, don't rely solely on the final letter score distance. 00043 #define MIN_FINAL_CHAR_DISTANCE (5) 00044 00045 // Minimum Visual vs Logical model score difference. 00046 // If the difference is below this, don't rely at all on the model score distance. 00047 #define MIN_MODEL_DISTANCE (0.01) 00048 00049 #define VISUAL_HEBREW_NAME ("ISO-8859-8") 00050 #define LOGICAL_HEBREW_NAME ("windows-1255") 00051 00052 namespace kencodingprober { 00053 bool nsHebrewProber::isFinal(char c) 00054 { 00055 return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI)); 00056 } 00057 00058 bool nsHebrewProber::isNonFinal(char c) 00059 { 00060 return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE)); 00061 // The normal Tsadi is not a good Non-Final letter due to words like 00062 // 'lechotet' (to chat) containing an apostrophe after the tsadi. This 00063 // apostrophe is converted to a space in FilterWithoutEnglishLetters causing 00064 // the Non-Final tsadi to appear at an end of a word even though this is not 00065 // the case in the original text. 00066 // The letters Pe and Kaf rarely display a related behavior of not being a 00067 // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for 00068 // example legally end with a Non-Final Pe or Kaf. However, the benefit of 00069 // these letters as Non-Final letters outweighs the damage since these words 00070 // are quite rare. 00071 } 00072 00098 nsProbingState nsHebrewProber::HandleData(const char* aBuf, unsigned int aLen) 00099 { 00100 // Both model probers say it's not them. No reason to continue. 00101 if (GetState() == eNotMe) 00102 return eNotMe; 00103 00104 const char *curPtr, *endPtr = aBuf+aLen; 00105 char cur; 00106 00107 for (curPtr = (char*)aBuf; curPtr < endPtr; ++curPtr) 00108 { 00109 cur = *curPtr; 00110 if (cur == ' ') // We stand on a space - a word just ended 00111 { 00112 if (mBeforePrev != ' ') // *(curPtr-2) was not a space so prev is not a 1 letter word 00113 { 00114 if (isFinal(mPrev)) // case (1) [-2:not space][-1:final letter][cur:space] 00115 ++mFinalCharLogicalScore; 00116 else if (isNonFinal(mPrev)) // case (2) [-2:not space][-1:Non-Final letter][cur:space] 00117 ++mFinalCharVisualScore; 00118 } 00119 } 00120 else // Not standing on a space 00121 { 00122 if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) // case (3) [-2:space][-1:final letter][cur:not space] 00123 ++mFinalCharVisualScore; 00124 } 00125 mBeforePrev = mPrev; 00126 mPrev = cur; 00127 } 00128 00129 // Forever detecting, till the end or until both model probers return eNotMe (handled above). 00130 return eDetecting; 00131 } 00132 00133 // Make the decision: is it Logical or Visual? 00134 const char* nsHebrewProber::GetCharSetName() 00135 { 00136 // If the final letter score distance is dominant enough, rely on it. 00137 int finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; 00138 if (finalsub >= MIN_FINAL_CHAR_DISTANCE) 00139 return LOGICAL_HEBREW_NAME; 00140 if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) 00141 return VISUAL_HEBREW_NAME; 00142 00143 // It's not dominant enough, try to rely on the model scores instead. 00144 float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence(); 00145 if (modelsub > MIN_MODEL_DISTANCE) 00146 return LOGICAL_HEBREW_NAME; 00147 if (modelsub < -(MIN_MODEL_DISTANCE)) 00148 return VISUAL_HEBREW_NAME; 00149 00150 // Still no good, back to final letter distance, maybe it'll save the day. 00151 if (finalsub < 0) 00152 return VISUAL_HEBREW_NAME; 00153 00154 // (finalsub > 0 - Logical) or (don't know what to do) default to Logical. 00155 return LOGICAL_HEBREW_NAME; 00156 } 00157 00158 00159 void nsHebrewProber::Reset(void) 00160 { 00161 mFinalCharLogicalScore = 0; 00162 mFinalCharVisualScore = 0; 00163 00164 // mPrev and mBeforePrev are initialized to space in order to simulate a word 00165 // delimiter at the beginning of the data 00166 mPrev = ' '; 00167 mBeforePrev = ' '; 00168 } 00169 00170 nsProbingState nsHebrewProber::GetState(void) 00171 { 00172 // Remain active as long as any of the model probers are active. 00173 if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) 00174 return eNotMe; 00175 return eDetecting; 00176 } 00177 00178 #ifdef DEBUG_PROBE 00179 void nsHebrewProber::DumpStatus() 00180 { 00181 printf(" HEB: %d - %d [Logical-Visual score]\r\n", mFinalCharLogicalScore, mFinalCharVisualScore); 00182 } 00183 #endif 00184 } 00185 00186