00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include "kcharsets.h"
00022
00023 #include "kfilterdev.h"
00024 #include "kentities.c"
00025
00026 #include "kconfig.h"
00027 #include "kdebug.h"
00028 #include "kglobal.h"
00029 #include "klocale.h"
00030
00031 #include <QtCore/QDir>
00032 #include <QtCore/QRegExp>
00033 #include <QtCore/QCharRef>
00034 #include <QtCore/QMutableStringListIterator>
00035 #include <QtCore/QTextCodec>
00036
00037 #include <assert.h>
00038 #include <QHash>
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
/*
 * Packed table of encoding names and character-set descriptions.
 *
 * Every entry is a NUL-terminated string stored back to back in one array;
 * entries are addressed by byte offset through language_for_encoding_indices.
 * An encoding name is followed by its description only the first time that
 * description appears; later encodings of the same group reuse the earlier
 * offset.
 *
 * I18N_NOOP2(context, text) presumably only marks the text for translation
 * extraction and expands to the plain literal -- the offsets in
 * language_for_encoding_indices are consistent with that reading.
 *
 * NOTE: inserting, removing or resizing any entry shifts every later offset,
 * so the index table must be updated together with this string.
 */
00153 static const char language_for_encoding_string[] =
00154 "ISO 8859-1\0"
00155 I18N_NOOP2("@item Text character set", "Western European")"\0"
00156 "ISO 8859-15\0"
00157 "ISO 8859-14\0"
00158 "cp 1252\0"
00159 "IBM850\0"
00160 "ISO 8859-2\0"
00161 I18N_NOOP2("@item Text character set", "Central European")"\0"
00162 "ISO 8859-3\0"
00163 "ISO 8859-4\0"
00164 I18N_NOOP2("@item Text character set", "Baltic")"\0"
00165 "ISO 8859-13\0"
00166 "ISO 8859-16\0"
00167 I18N_NOOP2("@item Text character set", "South-Eastern Europe")"\0"
00168 "cp 1250\0"
00169 "cp 1254\0"
00170 I18N_NOOP2("@item Text character set", "Turkish")"\0"
00171 "cp 1257\0"
00172 "KOI8-R\0"
00173 I18N_NOOP2("@item Text character set", "Cyrillic")"\0"
00174 "ISO 8859-5\0"
00175 "cp 1251\0"
00176 "KOI8-U\0"
00177 "IBM866\0"
00178 "Big5\0"
00179 I18N_NOOP2("@item Text character set", "Chinese Traditional")"\0"
00180 "Big5-HKSCS\0"
00181 "GB18030\0"
00182 I18N_NOOP2("@item Text character set", "Chinese Simplified")"\0"
00183 "GBK\0"
00184 "GB2312\0"
00185 "EUC-KR\0"
00186 I18N_NOOP2("@item Text character set", "Korean")"\0"
00187 "sjis\0"
00188 I18N_NOOP2("@item Text character set", "Japanese")"\0"
00189 "jis7\0"
00190 "EUC-JP\0"
00191 "ISO 8859-7\0"
00192 I18N_NOOP2("@item Text character set", "Greek")"\0"
00193 "cp 1253\0"
00194 "ISO 8859-6\0"
00195 I18N_NOOP2("@item Text character set", "Arabic")"\0"
00196 "cp 1256\0"
00197 "ISO 8859-8\0"
00198 I18N_NOOP2("@item Text character set", "Hebrew")"\0"
00199 "ISO 8859-8-I\0"
00200 "cp 1255\0"
00201 "ISO 8859-9\0"
00202 "TIS620\0"
00203 I18N_NOOP2("@item Text character set", "Thai")"\0"
00204 "ISO 8859-11\0"
00205 "UTF-8\0"
00206 I18N_NOOP2("@item Text character set", "Unicode")"\0"
00207 "UTF-16\0"
00208 "utf7\0"
00209 "ucs2\0"
00210 "ISO 10646-UCS-2\0"
00211 "winsami2\0"
00212 I18N_NOOP2("@item Text character set", "Northern Saami")"\0"
00213 "windows-1258\0"
00214 I18N_NOOP2("@item Text character set", "Other")"\0"
00215 "IBM874\0"
00216 "TSCII\0"
00217 "\0";
00218
/*
 * Offsets into language_for_encoding_string, stored as consecutive pairs:
 * the first value of a pair is the offset of an encoding name, the second is
 * the offset of the character-set description that encoding belongs to.
 * The list is terminated by -1 in the position of a pair's first value.
 */
00219 static const int language_for_encoding_indices[] = {
00220 0, 11, 28, 11, 40, 11, 52, 11,
00221 60, 11, 67, 78, 95, 78, 106, 117,
00222 124, 117, 136, 148, 169, 78, 177, 185,
00223 193, 117, 201, 208, 217, 208, 228, 208,
00224 236, 208, 243, 208, 250, 255, 275, 255,
00225 286, 294, 313, 294, 317, 294, 324, 331,
00226 338, 343, 352, 343, 357, 343, 364, 375,
00227 381, 375, 389, 400, 407, 400, 415, 426,
00228 433, 426, 446, 426, 454, 185, 465, 472,
00229 477, 472, 489, 495, 503, 495, 510, 495,
00230 515, 495, 520, 495, 536, 545, 560, 573,
00231 579, 573, 586, 573, -1
00232 };
00233
00234
00235
00236
00237
00238
00239
00240
00241
00242
00243
00244
00245
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268
00269
00270
00271
00272
00273
00274
00275
00276
00277
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326
00327
00328
00329
00330
00331
00332
00333
/*
 * Packed table of lower-case charset aliases and the codec names they map to.
 *
 * Entries are NUL-terminated strings stored back to back and addressed by
 * byte offset through builtin_indices. codecForNameOrNull() searches this
 * table with the lower-cased, prefix/suffix-stripped charset name and hands
 * the mapped name to QTextCodec::codecForName().
 *
 * NOTE: changing any entry shifts every later offset, so builtin_indices must
 * be updated together with this string.
 */
00334 static const char builtin_string[] =
00335 "iso-ir-111\0"
00336 "koi8-r\0"
00337 "koi unified\0"
00338 "us-ascii\0"
00339 "iso 8859-1\0"
00340 "usascii\0"
00341 "ascii\0"
00342 "unicode-1-1-utf-7\0"
00343 "utf-7\0"
00344 "ucs2\0"
00345 "iso-10646-ucs-2\0"
00346 "iso10646-1\0"
00347 "gb18030.2000-1\0"
00348 "gb18030\0"
00349 "gb18030.2000-0\0"
00350 "gbk-0\0"
00351 "gbk\0"
00352 "gb2312\0"
00353 "gb2312.1980-0\0"
00354 "big5-0\0"
00355 "big5\0"
00356 "euc-kr\0"
00357 "euckr\0"
00358 "euc-jp\0"
00359 "eucjp\0"
00360 "jisx0201.1976-0\0"
00361 "jisx0208.1983-0\0"
00362 "jisx0208.1990-0\0"
00363 "jisx0208.1997-0\0"
00364 "jisx0212.1990-0\0"
00365 "jisx0213.2000-1\0"
00366 "jisx0213.2000-2\0"
00367 "shift_jis\0"
00368 "sjis\0"
00369 "shift-jis\0"
00370 "iso-2022-jp\0"
00371 "jis7\0"
00372 "windows850\0"
00373 "ibm850\0"
00374 "windows866\0"
00375 "ibm866\0"
00376 "windows-850\0"
00377 "windows-866\0"
00378 "cp-10000\0"
00379 "apple roman\0"
00380 "thai-tis620\0"
00381 "iso 8859-11\0"
00382 "windows-874\0"
00383 "ibm874\0"
00384 "windows874\0"
00385 "cp-874\0"
00386 "ksc5601.1987-0\0"
00387 "ks_c_5601-1987\0"
00388 "mac-roman\0"
00389 "macintosh\0"
00390 "mac\0"
00391 "csiso2022jp\0"
00392 "\0";
00393
/*
 * Offsets into builtin_string, stored as consecutive pairs: the first value
 * is the offset of an alias, the second is the offset of the codec name that
 * alias resolves to. The list is terminated by -1 in the position of a
 * pair's first value.
 */
00394 static const int builtin_indices[] = {
00395 0, 11, 18, 11, 30, 39, 50, 39,
00396 58, 39, 64, 82, 88, 93, 109, 93,
00397 120, 135, 143, 135, 158, 164, 168, 164,
00398 175, 164, 189, 196, 201, 208, 214, 221,
00399 227, 221, 243, 221, 259, 221, 275, 221,
00400 291, 221, 307, 221, 323, 221, 339, 349,
00401 354, 349, 349, 349, 364, 376, 381, 392,
00402 399, 410, 417, 392, 429, 410, 441, 450,
00403 462, 474, 486, 498, 505, 498, 516, 498,
00404 523, 208, 538, 208, 553, 450, 563, 450,
00405 573, 450, 577, 364, -1
00406 };
00407
/*
 * Disabled (compiled out by "#if 0") alias table for charmap-file lookup.
 *
 * aliases_string holds NUL-terminated names back to back; aliases_indices
 * holds (alias offset, target offset) pairs terminated by -1, in the same
 * layout as the other lookup tables in this file. The only visible reference
 * to these tables is inside another "#if 0" region in codecForNameOrNull().
 */
00408 #if 0
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425 static const char aliases_string[] =
00426 "cp852\0"
00427 "ibm852\0"
00428 "cp-852\0"
00429 "x-cp-852\0"
00430 "windows852\0"
00431 "windows-852\0"
00432 "x-windows-852\0"
00433 "\0";
00434
00435 static const int aliases_indices[] = {
00436 0, 6, 13, 6, 20, 6, 29, 6,
00437 40, 6, 52, 6, -1
00438 };
00439 #endif
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464
/*
 * Packed table of fallback codec names ("conversion hints").
 *
 * Entries are NUL-terminated strings stored back to back and addressed by
 * byte offset through conversion_hints_indices. codecForNameOrNull() consults
 * this table last: when no codec was found for a charset name, the name
 * mapped here is tried with QTextCodec::codecForName() instead.
 *
 * NOTE: changing any entry shifts every later offset, so
 * conversion_hints_indices must be updated together with this string.
 */
00465 static const char conversion_hints_string[] =
00466 "cp1250\0"
00467 "iso-8859-2\0"
00468 "koi8-r\0"
00469 "iso-8859-5\0"
00470 "koi8-u\0"
00471 "pt 154\0"
00472 "windows-1251\0"
00473 "paratype-154\0"
00474 "pt-154\0"
00475 "\0";
00476
/*
 * Offsets into conversion_hints_string, stored as consecutive pairs: the
 * first value is the offset of a charset name, the second is the offset of
 * the codec name to try in its place. Terminated by -1 in the position of a
 * pair's first value.
 */
00477 static const int conversion_hints_indices[] = {
00478 0, 7, 18, 25, 36, 18, 43, 50,
00479 63, 50, 76, 50, -1
00480 };
00481
00482
00483
00484 static inline
00485 const char *kcharsets_array_search(const char *start, const int *indices, const char *entry)
00486 {
00487 for (int i = 0; indices[i] != -1; i += 2)
00488 if (qstrcmp(start + indices[i], entry) == 0)
00489 return start + indices[i + 1];
00490 return 0;
00491 }
00492
00493
00494 class KCharsetsPrivate
00495 {
00496 public:
00497 KCharsetsPrivate(KCharsets* _kc)
00498 {
00499 kc = _kc;
00500 codecForNameDict.reserve( 43 );
00501 }
00502
00503 QHash<QByteArray,QTextCodec*> codecForNameDict;
00504 KCharsets* kc;
00505
00506
00507 QList<QStringList> encodingsByScript;
00508 };
00509
00510
00511
00512 KCharsets::KCharsets()
00513 :d(new KCharsetsPrivate(this))
00514 {
00515 }
00516
00517 KCharsets::~KCharsets()
00518 {
00519 delete d;
00520 }
00521
00522 QChar KCharsets::fromEntity(const QString &str)
00523 {
00524 QChar res = QChar::Null;
00525
00526 if ( str.isEmpty() )
00527 return QChar::Null;
00528
00529 int pos = 0;
00530 if(str[pos] == '&') pos++;
00531
00532
00533 if (str[pos] == '#' && str.length()-pos > 1) {
00534 bool ok;
00535 pos++;
00536 if (str[pos] == 'x' || str[pos] == 'X') {
00537 pos++;
00538
00539 const QString tmp( str.mid( pos ) );
00540 res = tmp.toInt(&ok, 16);
00541 } else {
00542
00543 const QString tmp( str.mid( pos ) );
00544 res = tmp.toInt(&ok, 10);
00545 }
00546 if ( ok )
00547 return res;
00548 else
00549 return QChar::Null;
00550 }
00551
00552 const QByteArray raw ( str.toLatin1() );
00553 const entity *e = kde_findEntity( raw, raw.length() );
00554
00555 if(!e)
00556 {
00557
00558 return QChar::Null;
00559 }
00560
00561
00562 return QChar(e->code);
00563 }
00564
00565 QChar KCharsets::fromEntity(const QString &str, int &len)
00566 {
00567
00568
00569 len = 8;
00570 while(len > 0)
00571 {
00572 QString tmp = str.left(len);
00573 QChar res = fromEntity(tmp);
00574 if( res != QChar::Null ) return res;
00575 len--;
00576 }
00577 return QChar::Null;
00578 }
00579
00580
00581 QString KCharsets::toEntity(const QChar &ch)
00582 {
00583 QString ent;
00584 ent.sprintf("�x%x;", ch.unicode());
00585 return ent;
00586 }
00587
00588 QString KCharsets::resolveEntities( const QString &input )
00589 {
00590 QString text = input;
00591 const QChar *p = text.unicode();
00592 const QChar *end = p + text.length();
00593 const QChar *ampersand = 0;
00594 bool scanForSemicolon = false;
00595
00596 for ( ; p < end; ++p ) {
00597 const QChar ch = *p;
00598
00599 if ( ch == '&' ) {
00600 ampersand = p;
00601 scanForSemicolon = true;
00602 continue;
00603 }
00604
00605 if ( ch != ';' || scanForSemicolon == false )
00606 continue;
00607
00608 assert( ampersand );
00609
00610 scanForSemicolon = false;
00611
00612 const QChar *entityBegin = ampersand + 1;
00613
00614 const uint entityLength = p - entityBegin;
00615 if ( entityLength == 0 )
00616 continue;
00617
00618 const QChar entityValue = KCharsets::fromEntity( QString( entityBegin, entityLength ) );
00619 if ( entityValue.isNull() )
00620 continue;
00621
00622 const uint ampersandPos = ampersand - text.unicode();
00623
00624 text[ (int)ampersandPos ] = entityValue;
00625 text.remove( ampersandPos + 1, entityLength + 1 );
00626 p = text.unicode() + ampersandPos;
00627 end = text.unicode() + text.length();
00628 ampersand = 0;
00629 }
00630
00631 return text;
00632 }
00633
00634 QStringList KCharsets::availableEncodingNames() const
00635 {
00636 QStringList available;
00637 for ( const int *p = language_for_encoding_indices; *p != -1; p += 2)
00638 available.append( QString::fromUtf8( language_for_encoding_string + *p ) );
00639 available.sort();
00640 return available;
00641 }
00642
00643 QString KCharsets::languageForEncoding( const QString &encoding ) const
00644 {
00645 const char* lang = kcharsets_array_search( (const char*)language_for_encoding_string,
00646 language_for_encoding_indices,
00647 encoding.toUtf8().constData() );
00648 if ( lang )
00649 return i18nc( "@item Text character set", lang );
00650 else
00651 return i18nc( "@item Text character set", "Other" );
00652 }
00653
00654 QString KCharsets::descriptionForEncoding( const QString& encoding ) const
00655 {
00656 const char* lang = kcharsets_array_search( language_for_encoding_string,
00657 language_for_encoding_indices,
00658 encoding.toUtf8() );
00659 if ( lang )
00660 return i18nc( "@item %1 character set, %2 encoding", "%1 ( %2 )",
00661 i18nc( "@item Text character set", lang ), encoding );
00662 else
00663 return i18nc( "@item", "Other encoding (%1)", encoding );
00664 }
00665
00666 QString KCharsets::encodingForName( const QString &descriptiveName ) const
00667 {
00668 const int left = descriptiveName.lastIndexOf( '(' );
00669
00670 if (left<0)
00671 return descriptiveName.trimmed();
00672
00673 QString name(descriptiveName.mid(left+1));
00674
00675 const int right = name.lastIndexOf( ')' );
00676
00677 if (right<0)
00678 return name;
00679
00680 return name.left(right).trimmed();
00681 }
00682
00683 QStringList KCharsets::descriptiveEncodingNames() const
00684 {
00685 QStringList encodings;
00686 for ( const int *p = language_for_encoding_indices; *p != -1; p += 2) {
00687 const QString name = QString::fromUtf8( language_for_encoding_string + p[0] );
00688 const QString description = i18nc( "@item Text character set", language_for_encoding_string + p[1] );
00689 encodings.append( i18nc( "@item Text encoding: %1 character set, %2 encoding", "%1 ( %2 )",
00690 description, name ) );
00691 }
00692 encodings.sort();
00693 return encodings;
00694 }
00695
00696 QList<QStringList> KCharsets::encodingsByScript() const
00697 {
00698 if (!d->encodingsByScript.isEmpty())
00699 return d->encodingsByScript;
00700 int i;
00701 for ( const int *p = language_for_encoding_indices; *p != -1; p += 2) {
00702 const QString name = QString::fromUtf8( language_for_encoding_string + p[0] );
00703 const QString description = i18nc("@item Text character set", language_for_encoding_string + p[1] );
00704
00705 for (i=0; i<d->encodingsByScript.size(); ++i) {
00706 if (d->encodingsByScript.at(i).at(0) == description) {
00707 d->encodingsByScript[i].append(name);
00708 break;
00709 }
00710 }
00711
00712 if (i==d->encodingsByScript.size()) {
00713 d->encodingsByScript.append(QStringList() << description << name);
00714 }
00715
00716 }
00717 return d->encodingsByScript;
00718 }
00719
00720 QTextCodec* KCharsets::codecForName(const QString &n) const
00721 {
00722 const QByteArray name( n.toLatin1() );
00723 QTextCodec* codec = codecForNameOrNull( name );
00724 if ( codec )
00725 return codec;
00726 else
00727 return QTextCodec::codecForName( "iso-8859-1" );
00728 }
00729
00730 QTextCodec* KCharsets::codecForName(const QString &n, bool &ok) const
00731 {
00732 const QByteArray name( n.toLatin1() );
00733 QTextCodec* codec = codecForNameOrNull( name );
00734 if ( codec )
00735 {
00736 ok = true;
00737 return codec;
00738 }
00739 else
00740 {
00741 ok = false;
00742 return QTextCodec::codecForName( "iso-8859-1" );
00743 }
00744 }
00745
00746 QTextCodec *KCharsets::codecForNameOrNull( const QByteArray& n ) const
00747 {
00748 QTextCodec* codec = 0;
00749
00750 if (n.isEmpty()) {
00751
00752 const QByteArray locale = "->locale<-";
00753 if ( d->codecForNameDict.contains( locale ) )
00754 return d->codecForNameDict.value( locale );
00755 codec = KGlobal::locale()->codecForEncoding();
00756 d->codecForNameDict.insert("->locale<-", codec);
00757 return codec;
00758 }
00759
00760 else if ( d->codecForNameDict.contains( n ) ) {
00761 return d->codecForNameDict.value( n );
00762 }
00763
00764
00765
00766 codec = QTextCodec::codecForName( n );
00767 if ( codec ) {
00768 d->codecForNameDict.insert( n, codec );
00769 return codec;
00770 }
00771
00772
00773
00774 QByteArray name = n.toLower();
00775 bool changed = false;
00776 if (name.endsWith("_charset")) {
00777 name.chop( 8 );
00778 changed = true;
00779 }
00780 if ( name.startsWith( "x-" ) ) {
00781 name.remove( 0, 2 );
00782 changed = true;
00783 }
00784
00785 if (name.isEmpty()) {
00786
00787 return 0;
00788 }
00789
00790
00791 if ( changed ) {
00792 codec = QTextCodec::codecForName(name);
00793 if (codec) {
00794 d->codecForNameDict.insert( n, codec );
00795 return codec;
00796 }
00797 changed = false;
00798 }
00799
00800
00801
00802 QByteArray cname = kcharsets_array_search( builtin_string, builtin_indices, name);
00803
00804 if(!cname.isEmpty())
00805 codec = QTextCodec::codecForName(cname);
00806
00807 if (codec)
00808 {
00809 d->codecForNameDict.insert( n, codec );
00810 return codec;
00811 }
00812
00813 #ifdef __GNUC__
00814 #warning is it still useful with Qt4 ?
00815 #endif
00816
00817
00818
00819
00820 #if 0
00821 QString dir;
00822 {
00823 KConfigGroup cg( KGlobal::config(), "i18n" );
00824 dir = cg.readPathEntry("i18ndir", QLatin1String("/usr/share/i18n/charmaps"));
00825 }
00826
00827
00828
00829 cname = kcharsets_array_search< Aliases, const char* >( aliases, name.data());
00830
00831 if(cname.isEmpty())
00832 cname = name;
00833 cname = cname.toUpper();
00834
00835 const QString basicName = QLatin1String(cname);
00836 kDebug() << endl << " Trying to find " << cname << " in " << dir;
00837
00838 QString charMapFileName;
00839 bool gzipped = false;
00840 QDir qdir(dir);
00841 if (!qdir.exists()) {
00842
00843 }
00844 else if (qdir.exists(basicName, false)) {
00845 charMapFileName = basicName;
00846 }
00847 else if (qdir.exists(basicName+".gz", false)) {
00848 charMapFileName = basicName + ".gz";
00849 gzipped = true;
00850 }
00851 else {
00852
00853
00854
00855 QRegExp regexp("^(X-)?(CP|IBM)(-| )?(0-9)+");
00856 if ( regexp.search(basicName) != -1) {
00857 const QString num = regexp.cap(4);
00858 if (num.isEmpty()) {
00859
00860 }
00861 else if (qdir.exists("IBM"+num)) {
00862 charMapFileName = "IBM"+num;
00863 }
00864 else if (qdir.exists("IBM"+num+".gz")) {
00865 charMapFileName = "IBM"+num+".gz";
00866 gzipped = true;
00867 }
00868 else if (qdir.exists("CP"+num)) {
00869 charMapFileName = "CP"+num;
00870 }
00871 else if (qdir.exists("CP"+num+".gz")) {
00872 charMapFileName = "CP"+num+".gz";
00873 gzipped = true;
00874 }
00875 }
00876 }
00877
00878 if (gzipped && !charMapFileName.isEmpty()) {
00879 KFilterDev gzip(dir + '/' + charMapFileName);
00880 if (gzip.open(QIODevice::ReadOnly)) {
00881 kDebug() << "Loading gzipped charset...";
00882 codec = QTextCodec::loadCharmap(&gzip);
00883 gzip.close();
00884 }
00885 else
00886 kWarning() << "Could not open gzipped charset!";
00887 }
00888 else if (!charMapFileName.isEmpty()) {
00889 codec = QTextCodec::loadCharmapFile(dir + '/' + charMapFileName);
00890 }
00891
00892 if(codec) {
00893 d->codecForNameDict.insert( n, codec );
00894 return codec;
00895 }
00896 #endif
00897
00898
00899
00900 cname = kcharsets_array_search( conversion_hints_string, conversion_hints_indices, name );
00901
00902 if (!cname.isEmpty()) {
00903 codec = QTextCodec::codecForName(cname);
00904 if (codec) {
00905 d->codecForNameDict.insert( n, codec );
00906 return codec;
00907 }
00908 }
00909
00910
00911 return 0;
00912 }