Projects
Mega:23.09
uchardet
Sign Up
Log In
Username
Password
We truncated the diff of some files because they were too big. If you want to see the full diff for every file,
click here
.
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
Expand all
Collapse all
Changes of Revision 3
View file
_service:tar_scm:uchardet.spec
Changed
@@ -1,5 +1,5 @@ Name: uchardet -Version: 0.0.6 +Version: 0.0.8 Release: 1 Summary: An encoding detector library ported from Mozilla License: MPLv1.1 @@ -63,7 +63,11 @@ %{_includedir}/%{name}/ %{_libdir}/lib%{name}.so %{_libdir}/pkgconfig/%{name}.pc +%exclude %{_libdir}/cmake/%{name}/*.cmake %changelog +* Thu Oct 19 2023 Ge Wang <wang__ge@126.com> - 0.0.8-1 +- Update to version 0.0.8 + * Mon Mar 2 2020 openEuler Buildteam <buildteam@openeuler.org> - 0.0.6-1 - Package init
View file
_service
Changed
@@ -2,7 +2,7 @@ <service name="tar_scm"> <param name="url">git@gitee.com:src-openeuler/uchardet.git</param> <param name="scm">git</param> - <param name="revision">openEuler-23.09</param> + <param name="revision">master</param> <param name="exclude">*</param> <param name="extract">*</param> </service>
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/.gitignore -> _service:tar_scm:uchardet-0.0.8.tar.xz/.gitignore
Changed
@@ -1,1 +1,38 @@ __pycache__/ + +# CMake files +CMakeCache.txt +CMakeFiles/ +CTestTestfile.cmake +cmake_install.cmake + +# With make generator +Makefile + +# With ninja generator +.ninja_deps +.ninja_log +build.ninja + +# Built files +uchardet-config-version.cmake +uchardet-config.cmake +uchardet-targets.cmake +uchardet.pc +src/version.script + +# Build binaries +src/libuchardet.a +src/libuchardet.so* + +src/tools/uchardet +test/uchardet-tests + +# For Windows (untested) +src/libuchardet.dll + +src/tools/uchardet.exe +test/uchardet-tests.exe + +# For macOS (untested) +src/libuchardet.dylib
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/.gitlab-ci.yml
Added
@@ -0,0 +1,106 @@ +image: debian:testing + +stages: + - build + +variables: + GIT_DEPTH: "1" + +## GNU/Linux 64-bit CIs ## + +debian/testing-gcc: + stage: build + artifacts: + expire_in: 1 week + when: always + name: "uchardet-build-${CI_JOB_NAME}-${CI_COMMIT_REF_NAME}" + paths: + - _build + before_script: + - apt-get update + - apt-get install -y --no-install-recommends + build-essential + cmake + script: + - mkdir _build + - cd _build + - cmake .. + - make -j "$(nproc)" + - make test + - make install + +debian/testing-clang: + extends: debian/testing-gcc + variables: + CC: "clang" + CXX: "clang++" + before_script: + - apt-get update + - apt-get install -y --no-install-recommends + build-essential + clang + cmake + +## Windows CIs ## + +win64: + stage: build + artifacts: + expire_in: 1 week + when: always + name: "uchardet-build-${CI_JOB_NAME}-${CI_COMMIT_REF_NAME}" + paths: + - _build + before_script: + - apt-get update + - apt-get install -y --no-install-recommends + build-essential + cmake + cpio + gcc-mingw-w64-x86-64 + g++-mingw-w64-x86-64 + git + python3-distutils + python3-docutils + rpm + - apt-get install -y --reinstall ca-certificates + - git clone --depth=${GIT_DEPTH} git://git.tuxfamily.org/gitroot/crossroad/crossroad.git + - cd crossroad + - ./setup.py install --prefix=`pwd`/../.local + - cd .. + script: + - export PATH="`pwd`/.local/bin:$PATH" + - mkdir _build + - cd _build + - echo 'crossroad cmake .. 
&& make && make install' | crossroad w64 gimp --run="-" + +win32: + stage: build + artifacts: + expire_in: 1 week + when: always + name: "uchardet-build-${CI_JOB_NAME}-${CI_COMMIT_REF_NAME}" + paths: + - _build + before_script: + - apt-get update + - apt-get install -y --no-install-recommends + build-essential + cmake + cpio + gcc-mingw-w64-i686 + g++-mingw-w64-i686 + git + python3-distutils + python3-docutils + rpm + - apt-get install -y --reinstall ca-certificates + - git clone --depth=${GIT_DEPTH} git://git.tuxfamily.org/gitroot/crossroad/crossroad.git + - cd crossroad + - ./setup.py install --prefix=`pwd`/../.local + - cd .. + script: + - export PATH="`pwd`/.local/bin:$PATH" + - mkdir _build + - cd _build + - echo 'crossroad cmake .. && make && make install' | crossroad w32 gimp --run="-"
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/CMakeLists.txt -> _service:tar_scm:uchardet-0.0.8.tar.xz/CMakeLists.txt
Changed
@@ -1,15 +1,16 @@ ######## Project settings -cmake_minimum_required(VERSION 2.8.5) +cmake_minimum_required(VERSION 3.1) +include(CheckCCompilerFlag) set (PACKAGE_NAME uchardet) project (${PACKAGE_NAME} CXX C) enable_testing() ######## Package information set (PACKAGE_URL https://www.freedesktop.org/wiki/Software/uchardet/) -set (PACKAGE_BUGREPORT https://bugs.freedesktop.org/enter_bug.cgi?product=uchardet) +set (PACKAGE_BUGREPORT https://gitlab.freedesktop.org/uchardet/uchardet/-/issues) set (UCHARDET_VERSION_MAJOR 0) set (UCHARDET_VERSION_MINOR 0) -set (UCHARDET_VERSION_REVISION 6) +set (UCHARDET_VERSION_REVISION 8) if (CMAKE_BUILD_TYPE MATCHES Debug) set (version_suffix .debug) @@ -19,13 +20,8 @@ UCHARDET_VERSION ${UCHARDET_VERSION_MAJOR}.${UCHARDET_VERSION_MINOR}.${UCHARDET_VERSION_REVISION}${version_suffix} ) - -######## Windows - -#if (WIN32) -# set(CMAKE_SHARED_LIBRARY_PREFIX ${CMAKE_INSTALL_PREFIX}) -# set(CMAKE_STATIC_LIBRARY_PREFIX ${CMAKE_INSTALL_PREFIX}) -#endif (WIN32) +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) ######## Directory @@ -33,13 +29,31 @@ ######## Configuration -option(BUILD_BINARY "Build executable" ON) -option(BUILD_SHARED_LIBS "Build shared library and link executable to it" ON) +option(BUILD_BINARY "Build the CLI tool." ON) +option(BUILD_SHARED_LIBS "Build shared library and link executable to it." ON) +option(CHECK_SSE2 "Check and enable SSE2 extensions if supported. Disabling SSE on platforms which support it may decrease performances." ON) +set(TARGET_ARCHITECTURE "" CACHE STRING "Target CPU architecture. 
It is autodetected if not specified.") if (BUILD_SHARED_LIBS) option(BUILD_STATIC "Build static library" ON) endif (BUILD_SHARED_LIBS) +if (TARGET_ARCHITECTURE STREQUAL "") + string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" TARGET_ARCHITECTURE) +endif (TARGET_ARCHITECTURE STREQUAL "") + +if (TARGET_ARCHITECTURE MATCHES ".*(x86|amd|i686).*") + CHECK_C_COMPILER_FLAG(-msse2 SUPPORTS_CFLAG_SSE2) + CHECK_C_COMPILER_FLAG(-mfpmath=sse SUPPORTS_CFLAG_SSE_MATH) + if (CHECK_SSE2 AND SUPPORTS_CFLAG_SSE2 AND SUPPORTS_CFLAG_SSE_MATH) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2 -mfpmath=sse") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2 -mfpmath=sse") + else (CHECK_SSE2 AND SUPPORTS_CFLAG_SSE2 AND SUPPORTS_CFLAG_SSE_MATH) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffloat-store") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffloat-store") + endif (CHECK_SSE2 AND SUPPORTS_CFLAG_SSE2 AND SUPPORTS_CFLAG_SSE_MATH) +endif (TARGET_ARCHITECTURE MATCHES ".*(x86|amd|i686).*") + configure_file( uchardet.pc.in uchardet.pc @@ -48,7 +62,7 @@ install( FILES - ${CMAKE_BINARY_DIR}/uchardet.pc + ${CMAKE_CURRENT_BINARY_DIR}/uchardet.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig ) @@ -58,3 +72,39 @@ add_subdirectory(src) add_subdirectory(doc) add_subdirectory(test) + +######## Exported targets + +install( + EXPORT UchardetTargets + FILE ${PACKAGE_NAME}-targets.cmake + NAMESPACE ${PACKAGE_NAME}:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PACKAGE_NAME} +) + +export( + EXPORT UchardetTargets + FILE "${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}-targets.cmake" + NAMESPACE ${PACKAGE_NAME}:: +) + +include(CMakePackageConfigHelpers) +write_basic_package_version_file( + ${PACKAGE_NAME}-config-version.cmake + VERSION ${UCHARDET_VERSION} + COMPATIBILITY AnyNewerVersion +) + +configure_file( + ${PACKAGE_NAME}-config.cmake.in + ${PACKAGE_NAME}-config.cmake + @ONLY +) + +install ( + FILES + "${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}-config.cmake" + 
"${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}-config-version.cmake" + DESTINATION + ${CMAKE_INSTALL_LIBDIR}/cmake/${PACKAGE_NAME} +)
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/INSTALL -> _service:tar_scm:uchardet-0.0.8.tar.xz/INSTALL
Changed
@@ -1,4 +1,26 @@ -Execute release.sh or manually make a directory and check in, and execute - cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=Release && make -Then install - sudo make install +# Building uchardet (generic) + +`uchardet` uses a typical cmake installation. + +* Configure with `cmake`. There are various options. For instance, to configure + with a prefix as a release-ready build: + +> cmake -DCMAKE_INSTALL_PREFIX=/home/jehan/.local -DCMAKE_BUILD_TYPE=Release + +Alternatively, use `ccmake`, the curses interface to `cmake`. + +* Build with `make`. + +* Install with `make install`. + +Read `README` for more details on uchardet. + +# Building uchardet on Windows + +The above procedure is generic, which means it should work on any platform. +In particular, it works well on Linux. + +The procedure is the same on Windows, but if you want more details (for +instance which tools to use in order to run CMake on Windows, compiler +information, etc.), the following link may be useful: +https://github.com/BYVoid/uchardet/issues/39#issuecomment-353873891
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/README.md -> _service:tar_scm:uchardet-0.0.8.tar.xz/README.md
Changed
@@ -4,10 +4,6 @@ uchardet started as a C language binding of the original C++ implementation of the universal charset detection library by Mozilla. It can now detect more charsets, and more reliably than the original implementation. -The original code of universalchardet is available at http://lxr.mozilla.org/seamonkey/source/extensions/universalchardet/ - -Techniques used by universalchardet are described at http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html - ## Supported Languages/Encodings * International (Unicode) @@ -26,7 +22,20 @@ * EUC-TW * GB18030 * HZ-GB-2312 + * Croatian: + * ISO-8859-2 + * ISO-8859-13 + * ISO-8859-16 + * Windows-1250 + * IBM852 + * MAC-CENTRALEUROPE + * Czech + * Windows-1250 + * ISO-8859-2 + * IBM852 + * MAC-CENTRALEUROPE * Danish + * IBM865 * ISO-8859-1 * ISO-8859-15 * WINDOWS-1252 @@ -34,6 +43,19 @@ * ASCII * Esperanto * ISO-8859-3 + * Estonian + * ISO-8859-4 + * ISO-8859-13 + * ISO-8859-13 + * Windows-1252 + * Windows-1257 + * Finnish + * ISO-8859-1 + * ISO-8859-4 + * ISO-8859-9 + * ISO-8859-13 + * ISO-8859-15 + * WINDOWS-1252 * French * ISO-8859-1 * ISO-8859-15 @@ -50,13 +72,56 @@ * Hungarian: * ISO-8859-2 * WINDOWS-1250 + * Irish Gaelic + * ISO-8859-1 + * ISO-8859-9 + * ISO-8859-15 + * WINDOWS-1252 + * Italian + * ISO-8859-1 + * ISO-8859-3 + * ISO-8859-9 + * ISO-8859-15 + * WINDOWS-1252 * Japanese * ISO-2022-JP * SHIFT_JIS * EUC-JP * Korean * ISO-2022-KR - * EUC-KR + * EUC-KR / UHC + * Lithuanian + * ISO-8859-4 + * ISO-8859-10 + * ISO-8859-13 + * Latvian + * ISO-8859-4 + * ISO-8859-10 + * ISO-8859-13 + * Maltese + * ISO-8859-3 + * Norwegian + * IBM865 + * ISO-8859-1 + * ISO-8859-15 + * WINDOWS-1252 + * Polish: + * ISO-8859-2 + * ISO-8859-13 + * ISO-8859-16 + * Windows-1250 + * IBM852 + * MAC-CENTRALEUROPE + * Portuguese + * ISO-8859-1 + * ISO-8859-9 + * ISO-8859-15 + * WINDOWS-1252 + * Romanian: + * ISO-8859-2 + * ISO-8859-16 + * Windows-1250 + * IBM852 * Russian * ISO-8859-5 * KOI8-R @@ -64,10 +129,27 @@ * 
MAC-CYRILLIC * IBM866 * IBM855 + * Slovak + * Windows-1250 + * ISO-8859-2 + * IBM852 + * MAC-CENTRALEUROPE + * Slovene + * ISO-8859-2 + * ISO-8859-16 + * Windows-1250 + * IBM852 + * MAC-CENTRALEUROPE * Spanish * ISO-8859-1 * ISO-8859-15 * WINDOWS-1252 + * Swedish + * ISO-8859-1 + * ISO-8859-4 + * ISO-8859-9 + * ISO-8859-15 + * WINDOWS-1252 * Thai * TIS-620 * ISO-8859-11 @@ -94,9 +176,32 @@ dnf install uchardet uchardet-devel +### Gentoo + + emerge uchardet + ### Mac brew install uchardet + + or + + port install uchardet + +### Windows + +Binary packages are provided in Fedora and Msys2 repositories. There may +exist other pre-built packages but I am not aware of them. +Nevertheless the library is very easily and quickly compilable under +Windows as well, so finding a binary package is not necessary. +Some did it successfully with the CMake Windows +installer(https://cmake.org/download/) and MinGW. It should be possible +to use MinGW-w64 instead of MinGW, in particular to build both 32 and +64-bit DLL libraries). + +Note also that it is very easily cross-buildable (for instance from a +GNU/Linux machine; crossroad(https://pypi.org/project/crossroad/) may +help, this is what we use in our CI). ### Build from source @@ -105,24 +210,69 @@ If you prefer a development version, clone the git repository: - git clone git://anongit.freedesktop.org/uchardet/uchardet + git clone https://gitlab.freedesktop.org/uchardet/uchardet.git -The source can be browsed at: https://cgit.freedesktop.org/uchardet/uchardet/ +The source can be browsed at: https://gitlab.freedesktop.org/uchardet/uchardet cmake . make make install +### Build with flatpak-builder + +Here is a working "module" section to include in your Flatpak's json manifest: + +``` +"modules": + { + "name": "uchardet", + "buildsystem": "cmake", + "builddir": true, + "config-opts": "-DCMAKE_INSTALL_LIBDIR=lib" , + "sources": + { + ... + }
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/doc/README.maintainer
Added
@@ -0,0 +1,59 @@ +# How to do a uchardet release # + +* Update UCHARDET_VERSION_MAJOR, UCHARDET_VERSION_MINOR and + UCHARDET_VERSION_REVISION as needed in CMakeLists.txt. + +* Update README.md. + +* Commit the version change with the message "Release: version X.Y.Z." + +* Tag your release commit with: + + git tag -a vx.y.z + + The tag message should have the header "Version x.y.z released." followed + by a list of new features or important fixes. This tag message will be + considered as the release note, hence has to be carefully crafted. + + Considering that the previous release was va.b.c, you can read the full list + of commits between a.b.c and x.y.z with: + + git log va.b.c.. + + This should help you to build a proper release note. + +* Push the release and the tag: + + git push + git push origin vx.y.z + +* Create a release tarball: + + git archive --format=tar.xz --prefix=uchardet-x.y.z/ vx.y.z >uchardet-x.y.z.tar.xz + +Note: if you have not already set this up, you have to run first: + + git config tar.tar.xz.command "xz -c" + +Cf. EXAMPLES section in `git help archive`. + +* Compute a SHA1 checksum: + + sha1sum uchardet-x.y.z.tar.xz > uchardet-x.y.z.tar.xz.sha1 + +* Upload to annarchy download server: + + scp uchardet-x.y.z.tar.xz uchardet-x.y.z.tar.xz.sha1 annarchy.freedesktop.org:/srv/www.freedesktop.org/www/software/uchardet/releases/ + + The archive and its checksum file should now be available from: + https://www.freedesktop.org/software/uchardet/releases/ + +* Make the git tag into a Gitlab release (not automatic). + It will be found at: https://gitlab.freedesktop.org/uchardet/uchardet/-/tags/vx.y.z + Just click the "Edit release notes" button, and copy paste the tag comment as "release notes". + +* Update the wiki page: https://www.freedesktop.org/wiki/Software/uchardet/ + The release note link will be: + https://gitlab.freedesktop.org/uchardet/uchardet/-/releases/vx.y.z + +* Spread the good news!
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/doc/uchardet.1 -> _service:tar_scm:uchardet-0.0.8.tar.xz/doc/uchardet.1
Changed
@@ -1,8 +1,8 @@ .TH UCHARDET "1" "July 2011" "uchardet " "User Commands" .SH NAME -uchardet \- universalchardet (Universal Charset Detector) +uchardet \- Universal Charset Detector .SH DESCRIPTION -uchardet Command Line Tool +uchardet CLI is an encoding detector utility, which takes one or several files in unknown character encoding without any additional information, and attempts to determine the encoding of the texts. Returned encoding names are iconv-compatible. .SS "Usage:" .HP uchardet \fBOptions\fR \fBFile\fR... @@ -15,4 +15,4 @@ .HP .IP .PP -uchardet Command Line Tool +uchardet Command Line Interface
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/script/BuildLangModel.py -> _service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModel.py
Changed
@@ -50,6 +50,7 @@ import sys import re import os +import random # Custom modules. import charsets.db @@ -240,12 +241,22 @@ return next_titles = + if options.max_page is not None: + max_titles = int(options.max_page/(options.max_depth * options.max_depth)) + else: + max_titles = sys.maxsize for title in titles: if options.max_page is not None and \ len(visited_pages) > options.max_page: return if title in visited_pages: continue + + # Ugly hack skipping internal pages + if 'wiki' in title or 'Wiki' in title: + print('Skipping', title) + continue + visited_pages += title try: page = wikipedia.page(title) @@ -255,13 +266,22 @@ print("Discarding page {}.\n".format(title)) continue logfd.write("\n{} (revision {})".format(title, page.revision_id)) + logfd.flush() process_text(page.content, lang) - next_titles += page.links + try: + links = page.links + random.shuffle(links) + if len(links) > max_titles: + links = links:max_titles + next_titles += links + except KeyError: + pass if depth >= options.max_depth: return + random.shuffle(next_titles) visit_pages (next_titles, depth + 1, lang, logfd) language_c = lang.name.replace('-', '_').title() @@ -274,6 +294,7 @@ if options.max_page is not None: logfd.write('\n- Max number of pages: {}'.format(options.max_page)) logfd.write('\n\n== Parsed pages ==\n') +logfd.flush() try: visit_pages(lang.start_pages, 0, lang, logfd) except requests.exceptions.ConnectionError: @@ -281,6 +302,7 @@ exit(1) logfd.write('\n\n== End of Parsed pages ==') logfd.write('\n\n- Wikipedia parsing ended at: {}\n'.format(str(datetime.datetime.now()))) +logfd.flush() ########### CHARACTERS ########### @@ -411,10 +433,18 @@ uchar = local_lowercase(uchar, lang) for order, (char, ratio) in enumerate(sorted_ratios): if char == ord(uchar): - CTOM_str += '{:3},'.format(order) + CTOM_str += '{:3},'.format(min(249, order)) break else: - CTOM_str += '{:3},'.format(n_char) + # XXX: we must make sure the character order does not go + # over the special characters 
(250 currently). This may + # actually happen when building a model for a language + # writable with many different encoding. So let's just + # ceil the order value at 249 max. + # It may be an interesting alternative to add another + # constant for any character with an order > freqCharCount. + # Maybe IRR (irrelevant character) or simply CHR. + CTOM_str += '{:3},'.format(min(249, n_char)) n_char += 1 CTOM_str += ' /* {:X}X */'.format(line) CTOM_str += '\n};\n/*' @@ -507,6 +537,7 @@ SM_str += '\n};' c_code += SM_str +c_code += '\n' lang_model_file = current_dir + '/../src/LangModels/Lang{}Model.cpp'.format(language_c) with open(lang_model_file, 'w') as cpp_fd:
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangCroatianModel.log
Added
@@ -0,0 +1,157 @@ += Logs of language model for Croatian (hr) = + +- Generated by BuildLangModel.py +- Started: 2016-09-25 23:41:35.999066 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Fizika čvrstog stanja (revision 4776646) +Agregatno stanje (revision 4663090) +Alnico (revision 3915185) +Aluminij (revision 4772363) +Amorfna tvar (revision 4659679) +Antimon (revision 4420072) +Antoine Henri Becquerel (revision 4634966) +Apsolutna nula (revision 4706683) +Arsen (revision 4540773) +Arthur Holly Compton (revision 4736068) +Atom (revision 4778162) +Atomska jezgra (revision 4540956) +Bell Labs (revision 4769518) +Bor (element) (revision 4602837) +Brian Josephson (revision 4403761) +Cink (revision 4537854) +Coulombov zakon (revision 4710338) +Dijamant (revision 4625335) +Dimenzija (revision 4669110) +Dinastija Han (revision 4541686) +Dislokacija (revision 4668021) +EV (revision 4538157) +Eksponencijalna funkcija (revision 4160157) +Električna struja (revision 4280621) +Električna vodljivost (revision 4460160) +Električni izolator (revision 4649046) +Električni luk (revision 4646980) +Električni naboj (revision 4727496) +Električni otpor (revision 4593314) +Električni vodič (revision 4333008) +Električno polje (revision 4705679) +Elektrolit (revision 4486319) +Elektromagnetsko zračenje (revision 4537368) +Elektron (revision 4630705) +Elektronika (revision 4090016) +Elektronska konfiguracija (revision 4420620) +Elektronski mikroskop (revision 4413214) +Elektrotehnika (revision 4596912) +Energetika (revision 4586277) +Energija (revision 4719089) +Fermi-Diracova statistika (revision 3934172) +Feromagnetizam (revision 4760511) +Fizika (revision 4769955) +Fizika kondenzirane tvari (revision 4769955) +Fizikalna veličina (revision 4621676) +Fosfor (revision 4602427) +Fotodioda (revision 3939069) +Fotoelektrični učinak (revision 4704417) +Foton (revision 4537522) +Fotonaponski sustavi (revision 4418887) +Francuski jezik (revision 4771366) +Galij 
(revision 4537855) +Genitiv (revision 4625199) +Germanij (revision 4537856) +Helij (revision 4747001) +Henri (revision 3922500) +Indij (revision 4537867) +Integrirani krug (revision 4447159) +Ion (revision 4549144) +Ioniziranje (revision 4566703) +Izolator (revision 4649046) +John Bardeen (revision 4403736) +Kadmij (revision 3921860) +Kelvin (revision 4624351) +Keramika (revision 4599177) +Kinetička energija (revision 4719090) +Klasična mehanika (revision 4637127) +Kompas (revision 4702880) +Kondenzacija (revision 4477825) +Kondenzirana tvar (revision 4776646) +Konstrukcija (revision 4680450) +Kovalentna veza (revision 4641419) +Kristal (revision 4720329) +Kristalna rešetka (revision 4479184) +Kristalografija (revision 4105956) +Krutine (revision 4625162) +Kubični kristalni sustav (revision 4344344) +Kubični metar (revision 4616551) +Kvantna mehanika (revision 4541215) +Latinski jezik (revision 4760544) +Luminiscencija (revision 4708222) +Magnet (revision 4603344) +Magnetizam (revision 4760040) +Magnetska permeabilnost (revision 4675996) +Magnetska vodljivost (revision 4736934) +Magnetski moment (revision 4410235) +Magnetsko polje (revision 4678057) +Materijal (revision 4669230) +Mehanika (revision 4698699) +Metal (revision 4671710) +Metan (revision 4422418) +Metar (revision 4655527) +Mjerna veličina (revision 4621676) +Molekula (revision 4539232) +Molekule (revision 4539232) +Napon (revision 4585417) +Niskotemperaturna fizika (revision 4657522) +Njemački jezik (revision 4731246) +Optika (revision 4768098) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-25 23:50:27.589690 + +49 characters appeared 500582 times. 
+ +First 31 characters: + 0 Char a: 10.808019465342342 % + 1 Char i: 10.18554402675286 % + 2 Char e: 9.571259054460608 % + 3 Char o: 8.468143081453189 % + 4 Char n: 6.952906816465634 % + 5 Char t: 5.369549843981606 % + 6 Char r: 5.331993559496746 % + 7 Char j: 5.102860270644969 % + 8 Char s: 4.717109284792501 % + 9 Char k: 4.013927788054705 % +10 Char l: 3.854713113935379 % +11 Char u: 3.786792173909569 % +12 Char m: 3.730058212240951 % +13 Char v: 3.0989927724129114 % +14 Char p: 2.67308852495695 % +15 Char d: 2.6135578186990345 % +16 Char z: 1.8931963194841206 % +17 Char g: 1.5665765049482403 % +18 Char č: 1.161048539500022 % +19 Char b: 1.1440683044935693 % +20 Char c: 1.007627122029957 % +21 Char h: 0.8006680224219008 % +22 Char f: 0.5159993767254915 % +23 Char š: 0.422907735395999 % +24 Char ž: 0.3611795869607777 % +25 Char ć: 0.34959307366225717 % +26 Char đ: 0.2195444502598975 % +27 Char y: 0.11306838839590717 % +28 Char w: 0.07291512679241363 % +29 Char x: 0.04534721584076135 % +30 Char q: 0.02477116636235422 % + +The first 31 characters have an accumulated ratio of 0.9997702674087363. + +712 sequences found. + +First 512 (typical positive ratio): 0.9989731099787131 +Next 512 (512-1024): 1.9976747066414694e-06 +Rest: 3.7513395167998453e-17 + +- Processing end: 2016-09-25 23:50:27.987029
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangCzechModel.log
Added
@@ -0,0 +1,161 @@ += Logs of language model for Czech (cs) = + +- Generated by BuildLangModel.py +- Started: 2016-09-21 03:20:56.824516 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Sociální fobie (revision 13567590) +Adaptace (revision 13991192) +Agorafobie (revision 13013445) +Alkoholismus (revision 13822064) +Alprazolam (revision 14082425) +Antidepresivum (revision 14113423) +Asertivita (revision 14111958) +Atenolol (revision 12051880) +Automatické negativní myšlenky (revision 13567590) +Benzodiazepin (revision 13947546) +Beta-blokátory (revision 13428762) +Blud (revision 13888988) +Bohatství (revision 13556478) +Bupropion (revision 13686045) +Citaloparam (revision 13567590) +Clonazepan (revision 13567590) +Crohnova nemoc (revision 13745254) +Deprese (psychologie) (revision 13695735) +Diagnostický a statický manuál mentálních poruch (revision 13567590) +Diagnostický a statistický manuál mentálních poruch (revision 13714660) +Diagnóza (medicína) (revision 13052239) +Dichotomické myšlení (revision 13567590) +Digital object identifier (revision 14138049) +Dopamin (revision 13714274) +Dystymie (revision 13567267) +Důkaz kruhem (revision 13190761) +Elektivní mutismus (revision 9940891) +Emoce (revision 14110033) +Escitalopram (revision 12954987) +Evoluce (revision 13951488) +Expozice (psychologie) (revision 14119474) +Extraverze a introverze (revision 13872996) +Fluoxetin (revision 12955006) +Fluvoxamin (revision 12955006) +Gen (revision 13907182) +Generalizovaná úzkostná porucha (revision 14006709) +Halucinaci (revision 12188143) +Hněv (revision 14057864) +Inteligence (revision 14009781) +International Standard Serial Number (revision 12869806) +Interpersonální psychoterapie (revision 13567590) +Iracionalita (revision 4765977) +Ján Praško Pavlov (revision 14086840) +Klinické testování (revision 13530979) +Kognitivní omyl (revision 13107294) +Kognitivní psychologie (revision 11629465) +Kognitivní restrukturalizace (revision 13567360) 
+Kognitivně behaviorální terapie (revision 13980494) +Komorbidita (revision 11351714) +Lymská borelióza (revision 14068446) +Malé sebevědomí (revision 13567590) +Medical Subject Headings (revision 12239331) +Meditace (revision 13180783) +Mentální černý filtr (revision 13567590) +Mezinárodní klasifikace nemocí (revision 12531067) +Michael Liebowitz (revision 13567590) +Moclobemid (revision 13567590) +Moritova terapie (revision 11960292) +Musturbace (revision 13567590) +Nervozita (revision 13847097) +Noradrenalin (revision 14054165) +Obsedantně kompulzivní porucha (revision 13950365) +Panická ataka (revision 13253537) +Panická porucha (revision 13253537) +Paranoia (revision 14027052) +Paroxetin (revision 12955006) +Pohlavnost (revision 13564689) +Porucha (revision 11039108) +Pravděpodobnost (revision 13596041) +Predestinace (revision 12467403) +Profese (revision 13975485) +Propanolol (revision 12972658) +Psychiatr (revision 12767960) +Psychické trauma (revision 11227535) +Psychoaktivní droga (revision 13939232) +Psychodynamická léčba (revision 13567590) +Psychofarmaka (revision 9928215) +Psycholog (revision 12358728) +Psychoterapie (revision 13874178) +Puberta (revision 12540014) +RIMA (revision 10234728) +Remise (revision 9896748) +Richard Heimberg (revision 13567590) +Rámování myšlenek (revision 13567590) +Schizofrenie (revision 13977456) +Sebevražda (revision 14053884) +Selektivní abstrakce (revision 13567590) +Selektivní inhibitor zpětného vychytávání serotoninu (revision 12955027) +Serotonin (revision 13975104) +Sertralin (revision 12955006) +Skupinová terapie (revision 11964235) +Sociální chování (revision 13507313) +Sociální dovednost (revision 12226347) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-21 03:28:11.731386 + +47 characters appeared 594800 times. 
+ +First 41 characters: + 0 Char o: 8.323806321452588 % + 1 Char e: 8.040013449899126 % + 2 Char n: 6.895595158036315 % + 3 Char a: 6.263113651647613 % + 4 Char i: 5.650470746469401 % + 5 Char t: 5.40383322125084 % + 6 Char s: 4.588937457969065 % + 7 Char v: 3.8685272360457295 % + 8 Char p: 3.6914929388029587 % + 9 Char r: 3.6302958977807664 % +10 Char l: 3.6017148621385338 % +11 Char í: 3.5733019502353733 % +12 Char k: 3.301950235373235 % +13 Char u: 3.1782111634162744 % +14 Char c: 3.1383658372562206 % +15 Char d: 3.120208473436449 % +16 Char m: 2.758406186953598 % +17 Char h: 2.2747141896435776 % +18 Char á: 2.156186953597848 % +19 Char z: 2.0260591795561536 % +20 Char y: 1.9894082044384667 % +21 Char j: 1.8979488903833224 % +22 Char b: 1.8189307330195021 % +23 Char ě: 1.277236045729657 % +24 Char é: 1.2291526563550772 % +25 Char č: 0.9502353732347008 % +26 Char ž: 0.9214862138533961 % +27 Char ř: 0.8955951580363146 % +28 Char ý: 0.7646267652992602 % +29 Char š: 0.6605581708137189 % +30 Char f: 0.6260928043039677 % +31 Char ů: 0.5016812373907196 % +32 Char g: 0.47041022192333554 % +33 Char ú: 0.19502353732347008 % +34 Char x: 0.13685272360457296 % +35 Char ň: 0.05447209145931405 % +36 Char w: 0.04488903833221251 % +37 Char ó: 0.03429724277067922 % +38 Char ť: 0.02269670477471419 % +39 Char ď: 0.012104909213180902 % +40 Char q: 0.007229320780094149 % + +The first 41 characters have an accumulated ratio of 0.9999613315400132. + +1025 sequences found. + +First 512 (typical positive ratio): 0.9786035192432675 +Next 512 (512-1024): 1.6812373907195695e-06 +Rest: 2.0246480655940202e-06 + +- Processing end: 2016-09-21 03:28:12.235582
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/script/BuildLangModelLogs/LangDanishModel.log -> _service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangDanishModel.log
Changed
@@ -1,158 +1,256 @@ = Logs of language model for Danish (da) = - Generated by BuildLangModel.py -- Started: 2016-02-19 17:53:58.564190 -- Maximum depth: 4 -- Max number of pages: 100 +- Started: 2022-11-30 19:37:01.097250 +- Maximum depth: 2 +- Max number of pages: 200 == Parsed pages == -Forside (revision 2692411) -16. februar (revision 6877446) -17. februar (revision 8454583) -1878 (revision 8280505) -19. februar (revision 8206479) -1922 (revision 8455105) -1926 (revision 8425271) -1942 (revision 8443554) -1945 (revision 8448461) -1948 (revision 8454392) -1985 (revision 8409096) -2. verdenskrig (revision 8433181) -23. oktober (revision 6877825) -26. oktober (revision 7849938) -3C 273 (revision 8443798) -A-bus (revision 8427319) -Aktuelle begivenheder (revision 8440596) -B-52 Stratofortress (revision 8422571) -Borgerkrigen i Syrien (revision 8447763) -Boutros Boutros-Ghali (revision 8453935) -Brasilien (revision 8452750) -Cusco (region) (revision 7693764) -Danmark (revision 8451178) -Danmark i Eurovision Song Contest (revision 8453514) -Dansk (sprog) (revision 8455750) -Dansk Melodi Grand Prix 2016 (revision 8452164) -Dobbeltmordet på Peter Bangs Vej (revision 8334648) -Encyklopædi (revision 8446641) -Eritrea-sagen (revision 8452285) -Eurovision Song Contest 2014 (revision 8445804) -Eurovision Song Contest 2016 (revision 8453588) -Flygtningekrisen i Europa 2015 (revision 8452286) -Fonograf (revision 8177165) -Formel 1 (revision 8450846) -Formel 1 2016 (revision 8456463) -Frederik 6. 
(revision 8438503) -Første observation af gravitationsbølger (revision 8451269) -Grammofon (revision 8375093) -Guadalcanal (revision 7796248) -Harper Lee (revision 8456583) -Hartkorn (revision 8437552) -IC4 (revision 8446402) -IC4-sagen (revision 8434463) -Islamisk Stat (revision 8439228) -Jonathan Leunbach (revision 8452603) -Juliane Marie af Braunschweig-Wolfenbüttel (revision 8437957) -Kaliumklorid (revision 8452216) -Kejserriget Japan (revision 8044942) -Kevin Magnussen (revision 8455302) -København (revision 8427847) -LIGO (revision 8451266) -Latinamerika (revision 7692181) -Leonid Hurwicz (revision 8445727) -Lighthouse X (revision 8452940) -Linkoban (revision 8455879) -Machu Picchu (revision 8406907) -Matador (tv-serie) (revision 8454648) -Middelaldercentret (revision 8449194) -Nobelprisen (revision 8409809) -Nykøbing Falster (revision 8452825) -Nyligt afdøde (revision 8456580) -Overvågning (revision 8455039) -Panorama (foto) (revision 8448393) -Peru (revision 8437485) -Peter Lauritsen (revision 8456097) -Professor (revision 8415451) -Renault F1 (revision 8450843) -S-bus (revision 8455589) -Salomonøerne (revision 8238961) -Slaget om Belgien (1940) (revision 8430013) -Slaget om Guadalcanal (revision 7762887) -Slaget om Henderson Field (revision 8445480) -Slaget om Iwo Jima (revision 8145239) -Soldiers of Love (Lighthouse X-sang) (revision 8452929) -Solen (revision 8276478) -Stillehavskrigen (revision 8430649) -Stockholm (revision 8358042) -Søslaget ved Guadalcanal (revision 7772812) -Thomas Edison (revision 8282441) -Togulykken ved Bad Aibling (revision 8455364) -Topografi (revision 6886168) -USA (revision 8448088) -United States Army (revision 8401635) -United States Marine Corps (revision 8401667) -Vestallierede (revision 6961443) -Wikimedia (revision 8263252) -Wikipedia (revision 8267051) -Zikavirus (revision 8454832) -1. februar (revision 8404985) -10. februar (revision 6877431) -11. februar (revision 6877433) -12. februar (revision 6877437) -13. 
februar (revision 6877438) -14. februar (revision 6877441) -1497 (revision 7369489) -15. februar (revision 7329463) -1560 (revision 7874693) -1568 (revision 7369703) -1620 (revision 7423903) -1688 (revision 7367090) -18. februar (revision 6877450) +Forside (revision 10000691) +Hans Magnus Enzensberger (revision 11341046) +28. november (revision 9410945) +Golfkrigen (revision 11144370) +29. november (revision 6877900) +8. december (revision 10277754) +Det Konservative Folkeparti (revision 11313857) +1990 (revision 11340072) +1940 (revision 11263756) +Angolas håndboldlandshold (damer) (revision 11331888) +Skjoldvulkan (revision 10870812) +Casper & Mandrilaftalen (revision 11221713) +26. november (revision 10617630) +Døde i 2022 (revision 11343986) +Vikingetidens rustning og våben (revision 11332607) +Middelaldercentret (revision 11339897) +Ruslands invasion af Ukraine 2022 (revision 11335164) +Saddam Hussein (revision 11002258) +The Jimi Hendrix Experience (revision 10497780) +Færøerne (revision 11333678) +27. november (revision 9745974) +Thomas Vinterberg (revision 11234643) +Anwar Ibrahim (revision 11342876) +Mandatområdet i Palæstina (revision 11341286) +Kunst (revision 11336917) +Afrikamesterskabet i håndbold 2022 (kvinder) (revision 11341917) +Dansk (sprog) (revision 11313509) +Sergej Sjojgu (revision 11309097) +Fernando Gomes (revision 11340427) +Folketinget (revision 11330485) +15. 
januar (revision 10515606) +Rock and Roll Hall of Fame (revision 8408189) +Thomas Edison (revision 11052704) +Ukraine (revision 11334630) +1947 (revision 11252357) +1937 (revision 11303923) +IC4 (revision 11317878) +Jimi Hendrix (revision 11341476) +Ismail Sabri Yaakob (revision 11105534) +Okipa-ceremonien (revision 11340589) +SI-præfiks (revision 11332802) +Sporvejsmuseet Skjoldenæsholms historie (revision 11338275) +Irak (revision 11255676) +Woodstockfestivalen (revision 11226413) +Nikolaj Lie Kaas (revision 11322663) +Torben Rechendorff (revision 11342962) +Folketingsvalget 2022 (revision 11339557) +Kherson (revision 11314559) +Keltere (revision 11318773) +Little Richard (revision 11226619) +Invasion (revision 10307980) +Tate Gallery (revision 8312688) +24. januar (revision 10441562) +Hans Christian Ægidius (revision 9773029) +Slaget ved Irpin (1321) (revision 11230064) +Auschwitz (revision 11310714) +Jazz fusion (revision 11223082) +Lutsk (revision 11248429) +Planetarium (revision 11266837) +Bibliothèque nationale de France (revision 11055813) +Digtsamling (revision 10585337) +Kenneth Gøtterup (revision 11027437) +Straf (revision 11007456) +1716 (revision 11339928) +Kamel (revision 11285016) +Amnesti (revision 10831621) +Zulu Royal (revision 10969220) +Stephen Roche (revision 11239346) +13. december (revision 10768225) +Enhed (politisk parti) (revision 10158693) +The Everly Brothers (revision 10865882) +3. november (revision 9423371) +Annelise Gotfredsen (revision 11306090) +Virtual International Authority File (revision 8702589) +Europæiske Fællesskab (revision 10868689) +Væringer (revision 11331002) +Rom (revision 11341285) +Decentralisering (revision 11154770) +Kreml (Moskva) (revision 11045482) +Folketingsvalget 1994 (revision 11266325) +28. 
december (revision 6878014) +Østjyllands Storkreds (revision 11201505) +Bruxelles (revision 10802416) +Erik Haunstrup Clemmensen (revision 10627614) +Hviderussere (revision 10750673) +Hvidmelet Gåsefod (revision 11317723)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangEstonianModel.log
Added
@@ -0,0 +1,159 @@ += Logs of language model for Estonian (et) = + +- Generated by BuildLangModel.py +- Started: 2016-09-26 23:45:22.351942 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Harilik pohl (revision 4248853) +A-vitamiin (revision 4330862) +Aasta keskmine sademete hulk (revision 4266801) +Aasta keskmine õhutemperatuur (revision 3902142) +Ahm (revision 4343671) +Ain Raal (revision 4464651) +Alalehed (revision 2892741) +Alamliik (revision 3522810) +Alaska (revision 4216575) +Aleksander Heintalu (revision 4445156) +Aleuudid (revision 4335893) +Ameerika jänes (revision 4325220) +Ameerika valgejänes (revision 4355263) +Anneli Sihvart (revision 4211078) +Arbutiin (revision 4451788) +Baribal (revision 4268462) +Bensoehape (revision 3810308) +Binaarne nomenklatuur (revision 3970950) +C-vitamiin (revision 4444353) +Droog (revision 4352968) +E-vitamiin (revision 4336726) +Eesti (revision 4474984) +Eesti Entsüklopeediakirjastus (revision 4012421) +Eesti köök (revision 4314947) +Ellips (revision 4272113) +Emakakael (botaanika) (revision 3521516) +Euraasia (revision 3710768) +Fenoloogia (revision 3512905) +Folaadid (revision 4266628) +Fosfor (revision 4270122) +Fotosüntees (revision 4380600) +Fruktoos (revision 4285660) +Glükoos (revision 4047315) +Gneiss (revision 4333338) +Graniit (revision 4435351) +Gröönimaa (revision 4331557) +Halljänes (revision 4051603) +Haned (revision 4127680) +Happeline keskkond (revision 2966453) +Heilongjiang (revision 4342364) +Hendrik Relve (revision 4342591) +Hiina (revision 4448121) +Holland (revision 4307885) +Hunt (revision 4427752) +Hõimkond (revision 3489569) +Hüdrofiilsus (revision 4309797) +Ida-Euroopa (revision 4337624) +Ida-sinilind (revision 4248853) +Ida-vöötorav (revision 3520679) +Igihaljus (revision 3536500) +Ilves (revision 4404632) +Imetaja (revision 4289188) +Indiaanlased (revision 4479868) +Indrek Rohtmets (revision 4218674) +Itaalia (revision 4404119) +Jaapan (revision 4465542) +Jilin 
(revision 3894473) +Jood (revision 4025060) +Juurestik (revision 3341159) +Jääkaru (revision 4372399) +Jõhvikas (revision 4391549) +Kaalium (revision 4486067) +Kaheidulehelised (revision 4031352) +Kaheli õiekate (revision 3063362) +Kahesuguline õis (revision 3383221) +Kaitsestaatus (revision 3527096) +Kajakas (revision 4456839) +Kalorsus (revision 3843290) +Kaltsium (revision 4339861) +Kanada (revision 4434682) +Kanalised (revision 3616579) +Kanarbikulaadsed (revision 4318215) +Kanarbikulised (revision 3534760) +Karboksüülhapped (revision 3659011) +Karoteen (revision 4347634) +Kasvuperiood (revision 4231717) +Katteseemnetaimed (revision 4176294) +Kaukasus (revision 4476003) +Kesk-Euroopa (revision 3580746) +Kimalane (revision 4261145) +Kiudained (toit) (revision 3538655) +Klass (bioloogia) (revision 3489567) +Kliima (revision 4160781) +Korea (revision 4329396) +Kroom (revision 4030460) +Kroonlehed (revision 3543291) +Kuusepüü (revision 4028988) +Kvertsetiin (revision 4448461) +Laanemets (revision 4001157) +Laanepüü (revision 4475093) +Laiuskraad (revision 3990366) +Leesikas (revision 4420533) +Lehed (revision 4471821) +Leheroots (revision 3595351) +Liik (bioloogia) (revision 4320981) +Liiv (revision 4399494) +Liivakivi (revision 4330598) +Linnaeus (revision 4276836) +Linnud (revision 4479668) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-26 23:47:54.476445 + +55 characters appeared 433559 times. 
+ +First 33 characters: + 0 Char a: 12.486881831538499 % + 1 Char i: 10.26503889897338 % + 2 Char e: 10.177622884082673 % + 3 Char s: 8.710233209320991 % + 4 Char t: 6.56634967789851 % + 5 Char l: 6.051540851418146 % + 6 Char u: 5.423944607308348 % + 7 Char n: 5.131020230233947 % + 8 Char k: 4.663033174262327 % + 9 Char o: 4.526950195936424 % +10 Char d: 4.167368224393911 % +11 Char r: 3.6740097656835635 % +12 Char m: 3.552688330769284 % +13 Char v: 2.4700213811730354 % +14 Char p: 1.9229216784797456 % +15 Char g: 1.865259399528092 % +16 Char h: 1.8043680329551455 % +17 Char j: 1.6860450365463524 % +18 Char ä: 1.0247740215287884 % +19 Char b: 0.9255949017319443 % +20 Char õ: 0.9246723052687178 % +21 Char ü: 0.6536595941959457 % +22 Char f: 0.37342091849090897 % +23 Char c: 0.34851081398379463 % +24 Char ö: 0.24333481717597835 % +25 Char y: 0.1287022066200909 % +26 Char x: 0.06781084004714467 % +27 Char w: 0.04082489349777078 % +28 Char q: 0.020989069538401926 % +29 Char š: 0.018913227496142396 % +30 Char z: 0.017529332801302706 % +31 Char ō: 0.010379210211297655 % +32 Char ž: 0.009687262863877812 % + +The first 33 characters have an accumulated ratio of 0.9995410082595447. + +853 sequences found. + +First 512 (typical positive ratio): 0.9972721312183132 +Next 512 (512-1024): 9.687262863877811e-05 +Rest: -5.204170427930421e-18 + +- Processing end: 2016-09-26 23:47:54.561846
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangFinnishModel.log
Added
@@ -0,0 +1,156 @@ += Logs of language model for Finnish (fi) = + +- Generated by BuildLangModel.py +- Started: 2016-09-21 18:12:24.181917 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Yhdistynyt kuningaskunta (revision 15843357) +1. toukokuuta (revision 15910178) +1700-luku (revision 15493702) +1707 (revision 15106709) +1800-luku (revision 15708929) +2014 (revision 15891601) +409 (revision 12809782) +5. marraskuuta (revision 15421719) +927 (revision 12785964) +Aasia (revision 15948161) +Abhasia (revision 15730328) +Adolf Hitler (revision 15951829) +Afrikka (revision 15934209) +Agatha Christie (revision 15760740) +Aikavyöhyke (revision 15800313) +Ajoneuvon kansallisuustunnus (revision 15897445) +Akrotiri ja Dhekelia (revision 14625383) +Alamaat (revision 15913741) +Alan Turing (revision 15904871) +Alankomaat (revision 15936643) +Albania (revision 15767604) +Alec Guinness (revision 15363805) +Alexander Fleming (revision 15023225) +Alfred Hitchcock (revision 15892843) +Alfred Tennyson (revision 15856114) +Allen Jones (revision 12871703) +Andorra (revision 15913862) +Andrew Lloyd Webber (revision 14978349) +Anglit (revision 15902350) +Anguilla (revision 15854041) +Anne Brontë (revision 14287992) +Anthony Eden (revision 14391831) +Antigua ja Barbuda (revision 15196967) +Arabian Lawrence (revision 15736417) +Argentiina (revision 15676474) +Armenia (revision 15634470) +Arthur Conan Doyle (revision 15402837) +Arts and Crafts (revision 15806930) +Aurinko (revision 15934252) +Australia (revision 15934255) +Avara luonto (revision 15815943) +Azerbaidžan (revision 15946891) +BBC (revision 15866026) +BKT (revision 15656549) +Bahama (revision 15516869) +Bangladesh (revision 15883994) +Bank of England (revision 14481173) +Barbados (revision 15839821) +Barbara Hepworth (revision 15106880) +Bath (revision 15869900) +Beatrix Potter (revision 15057380) +Belfast (revision 15715934) +Belgia (revision 15932391) +Belize (revision 15665086) +Ben Nevis (revision 
15610196) +Bengalin kieli (revision 15551820) +Benjamin Britten (revision 15081615) +Bermuda (revision 15632621) +Bertrand Russell (revision 14631969) +Bhutan (revision 15377394) +Big Ben (revision 14897401) +Big Brother (revision 14641391) +Birmingham (revision 15855259) +Black Sabbath (revision 15839917) +Bosnia ja Hertsegovina (revision 15934266) +Botswana (revision 15524955) +Bristol (revision 15891889) +Bristolin kanaali (revision 15849713) +Bristolin kansainvälinen lentoasema (revision 14452870) +Britannia (provinssi) (revision 14557442) +Britannian avoin golfturnaus (revision 14293265) +Britannian kuninkaallinen perhe (revision 15522149) +Britannian talous (revision 15470242) +Britannian väestö (revision 15661241) +Brittein saaret (revision 15805422) +Brittiläinen Antarktiksen alue (revision 15836227) +Brittiläinen Intia (revision 15593126) +Brittiläinen Intian valtameren alue (revision 14272903) +Brittiläinen imperiumi (revision 15906600) +Brittiläinen kansainyhteisö (revision 15894379) +Brittiläinen keittiö (revision 13393533) +Brittiläinen kulttuuri (revision 15951407) +Brittiläiset Neitsytsaaret (revision 15910520) +Brittiläiset merentakaiset alueet (revision 15836213) +Brunei (revision 15580824) +Bruttokansantuote (revision 15656549) +Bulgaria (revision 15944101) +Burma (revision 15627218) +Cambridge (revision 14641664) +Cambridgen yliopisto (revision 15493340) +Canterburyn tarinoita (revision 15232140) +Cardiff (revision 15840398) +Caymansaaret (revision 15914575) +Channel 4 (revision 15882475) +Charles Babbage (revision 15203616) +Charles Chaplin (revision 15674652) +Charles Darwin (revision 15894085) +Charles Dickens (revision 15699592) +Charles Dickensin joulutarina (revision 15116247) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-21 18:15:05.189221 + +61 characters appeared 940364 times. 
+ +First 30 characters: + 0 Char a: 12.508773198463574 % + 1 Char i: 10.969475649854738 % + 2 Char n: 8.815841525196626 % + 3 Char t: 8.80169806585535 % + 4 Char e: 7.8206949649284745 % + 5 Char s: 7.595782058862313 % + 6 Char l: 5.963541777439374 % + 7 Char o: 5.439808414613916 % + 8 Char u: 5.0102938861972595 % + 9 Char k: 4.589712068943515 % +10 Char r: 3.1231523112326713 % +11 Char ä: 3.041800834570443 % +12 Char m: 3.0392486313810396 % +13 Char v: 2.156292669647073 % +14 Char h: 1.996141919512019 % +15 Char j: 1.9248929138078446 % +16 Char p: 1.6324529650220552 % +17 Char y: 1.6323466232224966 % +18 Char d: 1.1981530556252684 % +19 Char b: 0.6835650875618378 % +20 Char g: 0.5793501239945382 % +21 Char c: 0.5056552569005194 % +22 Char ö: 0.38931732818355447 % +23 Char f: 0.215023118707224 % +24 Char w: 0.2106631049253268 % +25 Char z: 0.06593191572625068 % +26 Char x: 0.024458613898447838 % +27 Char š: 0.010421496356729947 % +28 Char ž: 0.007869293167326695 % +29 Char q: 0.007762951367768225 % + +The first 30 characters have an accumulated ratio of 0.9996012182516557. + +919 sequences found. + +First 512 (typical positive ratio): 0.9985378147555799 +Next 512 (512-1024): 1.0634179955846884e-06 +Rest: 3.881443777498106e-17 + +- Processing end: 2016-09-21 18:15:05.307164
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangIrishModel.log
Added
@@ -0,0 +1,156 @@ += Logs of language model for Irish (ga) = + +- Generated by BuildLangModel.py +- Started: 2016-09-27 00:31:16.489602 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Tracy Caldwell Dyson (revision 812158) +14 Lúnasa (revision 716575) +1969 (revision 810361) +California (revision 790976) +Ceimic (revision 759983) +Ceimic fhisiciúil (revision 656896) +NASA (revision 806394) +Rúisis (revision 771746) +SAM (revision 807668) +Spáinnis (revision 812323) +Stáisiún Idirnáisiúnta Spáis (revision 806394) +Tointeálaí spáis (revision 761309) +10 Lúnasa (revision 649045) +11 Lúnasa (revision 776455) +12 Lúnasa (revision 716531) +13 Lúnasa (revision 716546) +1598 (revision 703178) +15 Lúnasa (revision 776986) +16 Lúnasa (revision 648836) +1740 (revision 791225) +1771 (revision 776762) +17 Lúnasa (revision 777131) +1823 (revision 791774) +1832 (revision 794492) +1898 (revision 805176) +18 Lúnasa (revision 777242) +1911 (revision 801932) +1956 (revision 797081) +1962 (revision 801511) +1966 (revision 807415) +19 Lúnasa (revision 648524) +1 Lúnasa (revision 647726) +2001 (revision 801012) +2004 (revision 795759) +2016 (revision 812091) +20 Lúnasa (revision 777924) +21 Lúnasa (revision 647805) +22 Lúnasa (revision 778960) +23 Lúnasa (revision 778453) +24 Lúnasa (revision 778495) +25 Lúnasa (revision 778551) +26 Lúnasa (revision 649051) +27 Lúnasa (revision 778763) +28 Lúnasa (revision 778813) +29 Lúnasa (revision 778959) +2 Lúnasa (revision 774393) +30 Lúnasa (revision 648308) +31 Lúnasa (revision 649053) +3 Lúnasa (revision 647811) +4 Lúnasa (revision 786284) +5 Lúnasa (revision 776845) +6 Lúnasa (revision 647834) +7 Lúnasa (revision 775859) +8 Lúnasa (revision 648745) +9 Lúnasa (revision 648522) +AK Parti (revision 792248) +An Phacastáin (revision 759339) +An Tuirc (revision 811970) +Aoine (revision 717430) +Bertolt Brecht (revision 800584) +Czesław Miłosz (revision 780306) +Céadaoin (revision 717606) +Dan Boyle (revision 797926) 
+Domhnach (revision 717663) +Déardaoin (revision 647860) +Féilire (revision 648837) +Halle Berry (revision 759955) +Henry Bagenal (revision 716575) +Iúil (revision 647071) +Luan (revision 717791) +Lúnasa (revision 810265) +Meán Fómhair (revision 779166) +Pápa Pius VII (revision 758126) +Satharn (revision 784525) +Walter Scott (revision 759029) +Áth Buí (revision 716575) +11 Márta (revision 716519) +17 Márta (revision 798614) +1882 (revision 801198) +1886 (revision 776624) +1890 (revision 801200) +1891 (revision 796677) +1903 (revision 812849) +1922 (revision 801227) +1930í (revision 740221) +1940í (revision 740219) +1950í (revision 740217) +1960í (revision 772724) +1967 (revision 796983) +1968 (revision 810926) +1970 (revision 812852) +1970í (revision 740213) +1971 (revision 809746) +1972 (revision 789490) +1980í (revision 740211) +1990í (revision 740208) +19ú haois (revision 739964) +1 Bealtaine (revision 647679) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-27 00:33:40.157338 + +44 characters appeared 183561 times. 
+ +First 31 characters: + 0 Char a: 15.192769705983297 % + 1 Char i: 10.534372769814938 % + 2 Char n: 8.106297089250985 % + 3 Char h: 7.243368689427493 % + 4 Char r: 6.442544985045844 % + 5 Char e: 6.198484427520007 % + 6 Char s: 5.622654049607488 % + 7 Char t: 4.776068990689743 % + 8 Char c: 4.543448771797931 % + 9 Char l: 4.1953356105054995 % +10 Char o: 3.9469168287381304 % +11 Char d: 3.2169142682813887 % +12 Char g: 2.811054635788648 % +13 Char m: 2.6269196615838877 % +14 Char á: 2.2749930540801153 % +15 Char u: 2.1932763495513754 % +16 Char b: 2.0478206154902185 % +17 Char í: 1.6599386579938005 % +18 Char é: 1.2829522611012143 % +19 Char f: 1.1494816437042727 % +20 Char ú: 1.0525111543301682 % +21 Char p: 0.9059658642086281 % +22 Char ó: 0.8890777452726886 % +23 Char v: 0.2522322279787101 % +24 Char y: 0.23479933101257894 % +25 Char k: 0.18195586208399386 % +26 Char w: 0.1688811893593955 % +27 Char j: 0.09697048937410452 % +28 Char z: 0.07735848028720697 % +29 Char x: 0.0343210159020707 % +30 Char q: 0.010895560603831969 % + +The first 31 characters have an accumulated ratio of 0.9997058198636966. + +701 sequences found. + +First 512 (typical positive ratio): 0.9974076651249096 +Next 512 (512-1024): 5.447780301915984e-06 +Rest: -2.7755575615628914e-17 + +- Processing end: 2016-09-27 00:33:40.258886
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangItalianModel.log
Added
@@ -0,0 +1,162 @@ += Logs of language model for Italian (it) = + +- Generated by BuildLangModel.py +- Started: 2016-09-21 18:43:12.831409 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Pieve Ligure (revision 83186252) +010 (prefisso) (revision 76157203) +1000 (revision 83185341) +1143 (revision 70627567) +1162 (revision 70627612) +118 - Emergenza sanitaria (revision 83267411) +1201 (revision 77523243) +1202 (revision 76764411) +1374 (revision 78259457) +1404 (revision 70628069) +1520 (revision 76854924) +1537 (revision 70628296) +1582 (revision 80626188) +1584 (revision 76837051) +1600 (revision 76869356) +1619 (revision 70628455) +1742 (revision 70628675) +1748 (revision 70628682) +1749 (revision 70628684) +1750 (revision 70628690) +1754 (revision 70628697) +1775 (revision 70628734) +1797 (revision 78338823) +1798 (revision 82047236) +1803 (revision 77502534) +1805 (revision 79369853) +1809 (revision 70628789) +1810 (revision 82930218) +1814 (revision 78338825) +1815 (revision 82669615) +1816 (revision 83185384) +1818 (revision 72407239) +1823 (revision 74880156) +1859 (revision 83185401) +1860 (revision 83185403) +1861 (revision 83185412) +1868 (revision 83185430) +1874 (revision 83185441) +1897 (revision 83185267) +1908 (revision 83185631) +1909 (revision 83185630) +1913 (revision 83185626) +1915 (revision 83185625) +1917 (revision 83185270) +1920 (revision 83185621) +1921 (revision 83185619) +1923 (revision 83185616) +1925 (revision 83185614) +1926 (revision 83185612) +1928 (revision 83185610) +1929 (revision 83185609) +1939 (revision 83185598) +1946 (revision 83185590) +1947 (revision 83185589) +1948 (revision 83185587) +1951 (revision 83185584) +1956 (revision 83185478) +1960 (revision 83185487) +1964 (revision 83185493) +1965 (revision 83185494) +1969 (revision 83185500) +1970 (revision 83185503) +1971 (revision 83185505) +1975 (revision 83185510) +1976 (revision 83185513) +1977 (revision 83185514) +1980 (revision 83185518) +1981 
(revision 83308867) +1983 (revision 83185524) +1985 (revision 83185526) +1988 (revision 83185280) +1990 (revision 83185531) +1995 (revision 83185538) +1999 (revision 83326325) +2000 (revision 83185544) +2001 (revision 83309058) +2002 (revision 83185545) +2003 (revision 83185546) +2004 (revision 83185283) +2005 (revision 83185285) +2006 (revision 83185547) +2007 (revision 83185549) +2008 (revision 83185551) +2009 (revision 83185552) +2010 (revision 83185287) +2012 (revision 83185289) +712 (revision 70630167) +749 (revision 78272323) +ATP (Provincia di Genova) (revision 82754117) +Abbazia di San Colombano (revision 83062997) +Abbazia di San Fruttuoso (revision 83288120) +Acacia dealbata (revision 83036867) +Acquedotto (revision 82973825) +Affresco (revision 82000422) +Agricoltura (revision 82578266) +Allevamento (revision 82971452) +Altitudine (revision 82971213) +Angelo (revision 82333116) +Anni 1960 (revision 83161222) +Anni 1970 (revision 81663175) +Antica Roma (revision 83125874) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-21 18:46:08.840718 + +59 characters appeared 823241 times. 
+ +First 34 characters: + 0 Char i: 11.823147778111148 % + 1 Char a: 11.252112078965942 % + 2 Char e: 10.910170897707962 % + 3 Char o: 8.936386793174782 % + 4 Char n: 7.317055394471364 % + 5 Char l: 6.931263141655967 % + 6 Char r: 6.521784021932824 % + 7 Char t: 6.386708145002497 % + 8 Char s: 4.572415610981475 % + 9 Char c: 4.116291584116923 % +10 Char d: 3.9770856893667834 % +11 Char u: 2.8944136650142545 % +12 Char m: 2.762860450342002 % +13 Char p: 2.6809889206198427 % +14 Char g: 2.1493098618751985 % +15 Char v: 1.5369739845318686 % +16 Char b: 1.2855287819727153 % +17 Char f: 0.9932692856648295 % +18 Char z: 0.9664241698360504 % +19 Char h: 0.7159507361756764 % +20 Char q: 0.2416060424590126 % +21 Char k: 0.18876610858788617 % +22 Char à: 0.15596890825408355 % +23 Char y: 0.12462936126844994 % +24 Char è: 0.11600491229178332 % +25 Char w: 0.10628722330398996 % +26 Char x: 0.10312897438295712 % +27 Char j: 0.07555503188009344 % +28 Char ù: 0.05575524056746445 % +29 Char ò: 0.03304014255849745 % +30 Char é: 0.021014502436103158 % +31 Char ì: 0.0191924357508919 % +32 Char á: 0.004737373381549267 % +33 Char ó: 0.003644133370422513 % + +The first 34 characters have an accumulated ratio of 0.9997947138201325. + +872 sequences found. + +First 512 (typical positive ratio): 0.9989484485502651 +Next 512 (512-1024): 1.214711123474171e-06 +Rest: -4.336808689942018e-17 + +- Processing end: 2016-09-21 18:46:08.920456
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangLatvianModel.log
Added
@@ -0,0 +1,162 @@ += Logs of language model for Latvian (lv) = + +- Generated by BuildLangModel.py +- Started: 2016-09-21 00:16:33.485953 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Zigfrīds Anna Meierovics (revision 2546984) +1. Saeima (revision 2511127) +1. Saeimas deputāti (revision 2303859) +1. Saeimas frakcijas (revision 2429725) +1. Saeimas vēlēšanas (revision 2464758) +1887. gads (revision 2583253) +1919. gada Parīzes miera konference (revision 2482078) +1920 (revision 2401222) +1921 (revision 2473337) +1922 (revision 2486819) +1923 (revision 2544643) +1924 (revision 2539361) +1925 (revision 2486795) +22. augusts (revision 2583254) +31. jūlijs (revision 2559648) +5. februāris (revision 2581966) +ASV (revision 2549746) +Agrārā reforma Latvijā (revision 2473423) +Agudas Izrael (Latvija) (revision 2311143) +Aigars Kalvītis (revision 2545858) +Alberts Kviesis (revision 2546934) +Aleksandrs Bočagovs (revision 2329526) +Aleksandrs Dauge (revision 2546805) +Aleksandrs Jaunbērzs (revision 2462254) +Aleksandrs Kerenskis (revision 2461214) +Aleksandrs Millerāns (revision 2309419) +Aleksandrs Neibergs (revision 2491897) +Alfrēds Birznieks (revision 2567317) +Alfrēds Jēkabs Bērziņš (revision 2564068) +Alfrēds Riekstiņš (politiķis) (revision 2586148) +Andrejs Bērziņš (revision 2564283) +Andrejs Kurcijs (revision 2564338) +Andrejs Petrevics (revision 2460269) +Andrejs Sīmanis (revision 2547079) +Andrejs Veckalns (revision 2564224) +Andrievs Niedra (revision 2546988) +Andris Bērziņš (politiķis, 1951) (revision 2218488) +Andris Šķēle (revision 2457423) +Angļu valoda (revision 2447598) +Ansis Buševics (revision 2578312) +Ansis Rudevics (revision 2414854) +Antante (revision 2581862) +Antons Dzenis (revision 2564295) +Antons Laizāns (revision 2467408) +Antons Rubins (1885) (revision 2465396) +Antons Velkme (revision 2564425) +Ants Pīps (revision 2564383) +Apollo (portāls) (revision 2371202) +Apolonija Laurinoviča (revision 2466232) +Aprīļa pučs 
(revision 2150686) +Apvienotā Karaliste (revision 2566258) +Aristīds Briāns (revision 2536819) +Arons Nuroks (revision 2337085) +Arturs Alberings (revision 2442531) +Arturs Ozols (inženieris) (revision 2491399) +Artūrs Balfūrs (revision 2309461) +Artūrs Vīgants (revision 2461471) +Artūrs Žers (revision 2564230) +Arveds Bergs (revision 2564118) +Arveds Švābe (revision 2586288) +Arvīds Kalniņš (revision 2545254) +Aspazija (revision 2574081) +Augusts Briedis (revision 2546879) +Augusts Kalniņš (revision 2436647) +Augusts Kirhenšteins (revision 2547109) +Austroungārija (revision 2524307) +Autoritatīvā vadība (revision 2385793) +Balfūra nota (revision 2538973) +Baltijas Antante (revision 2541901) +Baltijas pārkrievošana (revision 2570657) +Bermontiāde (revision 2499160) +Bernards Kublinskis (revision 2441386) +Bezpartijiskais nacionālais centrs (revision 2438819) +Beļģija (revision 2579008) +Brestļitovskas miera līgums (revision 2569020) +Brizules muiža (revision 2584564) +Bruno Kalniņš (revision 2566572) +Brīvības piemineklis (revision 2578595) +Bulduru konference (revision 2193449) +Ceire-Cion (revision 2311779) +Celmiņa 1. Ministru kabinets (revision 2112830) +Delfi (portāls) (revision 2544918) +Demokrātiskais Centrs (revision 2113060) +Demokrātu savienība (revision 2179593) +Diena (laikraksts) (revision 2548854) +Donats Bicāns (revision 2479349) +Dubulti (Jūrmala) (revision 2456811) +Durbe (revision 2381790) +Dāvids Komisārs (revision 2574685) +Džovanni Džoliti (revision 2538055) +Ebreju bloks (revision 2311643) +Ebreju nacionāldemokrātu partija (revision 2312288) +Eduards Grantskalns (revision 2565167) +Eduards Jaunzems (revision 2452579) +Eduards Laimiņš (revision 2449521) +Eduards Radziņš (revision 2564393) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-21 00:19:18.361533 + +55 characters appeared 354745 times. 
+ +First 39 characters: + 0 Char a: 11.905171320244119 % + 1 Char i: 9.3977364022044 % + 2 Char s: 8.224217395594017 % + 3 Char e: 6.367108768270166 % + 4 Char r: 5.854064186951191 % + 5 Char t: 5.831230884156225 % + 6 Char u: 4.939604504644181 % + 7 Char n: 4.463769750102186 % + 8 Char ā: 3.9498794909019157 % + 9 Char l: 3.8030134321836813 % +10 Char o: 3.6296494665182033 % +11 Char k: 3.524785409237621 % +12 Char m: 3.2739009711201 % +13 Char d: 3.177775585279567 % +14 Char v: 3.0046935122411873 % +15 Char p: 2.827101157169234 % +16 Char j: 2.8166711299665956 % +17 Char b: 2.0279355593454453 % +18 Char ī: 1.8855797826607845 % +19 Char g: 1.6146809680192813 % +20 Char z: 1.5343415692962552 % +21 Char ē: 1.4593581304880971 % +22 Char c: 1.2231321089796898 % +23 Char š: 0.8876798827326671 % +24 Char ņ: 0.46596851259355315 % +25 Char f: 0.4203019070036223 % +26 Char ļ: 0.34700982395805435 % +27 Char ū: 0.30162511099522193 % +28 Char h: 0.20070755049401684 % +29 Char ž: 0.18774048964749326 % +30 Char ķ: 0.14207388405756247 % +31 Char ģ: 0.1268516821942522 % +32 Char č: 0.08287643236691145 % +33 Char w: 0.0324176521163089 % +34 Char y: 0.02734358482853881 % +35 Char x: 0.015785987117506943 % +36 Char ö: 0.005074067287770088 % +37 Char é: 0.003946496779376736 % +38 Char q: 0.0031008188980817205 % + +The first 39 characters have an accumulated ratio of 0.9998590536864506. + +970 sequences found. + +First 512 (typical positive ratio): 0.9904102202220861 +Next 512 (512-1024): 0.0018774048964749328 +Rest: -1.734723475976807e-17 + +- Processing end: 2016-09-21 00:19:18.484318
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangLithuanianModel.log
Added
@@ -0,0 +1,162 @@ += Logs of language model for Lithuanian (lt) = + +- Generated by BuildLangModel.py +- Started: 2016-09-21 00:23:03.857157 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Karūna (laivas) (revision 5080379) +1650 (revision 4990868) +1654 (revision 4991037) +1664 (revision 4991048) +1665 (revision 4991050) +1668 (revision 4991052) +1669 (revision 4991053) +1672 (revision 4991056) +1676 (revision 4991060) +1718 (revision 4990914) +1909 (revision 4990667) +1928 (revision 4990262) +1932 (revision 4990613) +1956 (revision 4990635) +1980 (revision 4990655) +Baltijos jūra (revision 5052833) +Burinis laivas (revision 4657401) +Flagmanas (laivas) (revision 5005271) +Grimzlė (revision 4487052) +Kalmaras (Švedija) (revision 4978519) +Karo laivas (revision 4726931) +Karolis XI (revision 4944621) +Karolis XII (revision 4915230) +Kilis (revision 4325533) +Koordinačių sistema (revision 5033980) +Laivo vėliava (revision 4986001) +Liepos 1 d. (revision 4910200) +Nyderlandai (revision 5080140) +Rugpjūčio 10 (revision 4910281) +Varytuvas (revision 4620792) +Vaza (laivas) (revision 5079282) +XVIII a. (revision 4896219) +XVII a. (revision 4768242) +Švedija (revision 5057665) +Švedų kalba (revision 4687559) +1590 (revision 4990983) +1596 (revision 4990989) +1608 (revision 4991000) +1610 (revision 4991002) +1623 m. (revision 4991015) +1634 m. (revision 4991026) +1643 m. (revision 4990870) +1644 m. (revision 4990872) +1645 m. (revision 4990873) +1646 m. (revision 4990874) +1647 m. (revision 4913295) +1648 m. (revision 4990875) +1649 m. (revision 4990876) +1651 m. (revision 4991035) +1652 m. (revision 4991072) +1653 m. (revision 4991036) +1654 m. (revision 4991037) +1655 m. (revision 4991038) +1662 m. (revision 4991046) +1668 m. (revision 4991052) +1677 m. 
(revision 4991061) +1702 (revision 4990595) +1704 (revision 4990863) +1722 (revision 4990918) +1723 (revision 4990919) +1737 (revision 4990931) +2 tūkstantmetis (revision 4296407) +ATR (revision 5078529) +Abiejų Tautų Respublika (revision 5078529) +Adomas Freitagas (revision 4362991) +Anglų kalba (revision 4911240) +Armėnų kalendorius (revision 4817534) +Bahajų kalendorius (revision 4706296) +Bajorai (revision 5006456) +Berberų kalendorius (revision 4926904) +Birželio 21 (revision 4910142) +Bizantijos kalendorius (revision 4927623) +Budistų kalendorius (revision 4705734) +Dešimtmetis (revision 4296419) +Dominikonai (revision 4921895) +Dominikonų ordinas (revision 4921895) +Džohoro sultonatas (revision 4934526) +Džu Ihai (revision 4991072) +Džu Joulang (revision 4991072) +Emanuelis Vladislovas Tiškevičius Logoiskis (revision 4939239) +Filosofas (revision 5078172) +Gegužės 26 (revision 4910130) +Grafas (titulas) (revision 5008057) +Grigaliaus kalendorius (revision 5000317) +Hebrajų kalendorius (revision 4728592) +Imperatorius Go-Komijas (revision 4907057) +Inocentas X (revision 4905150) +Iraniečių kalendorius (revision 4964854) +Isaac Titsingh (revision 4990745) +Japonija (revision 5035249) +Japonijos imperatorius (revision 4720428) +Japonų kalendorius (revision 4956765) +John Churchill (revision 4903704) +Jonas Kazimieras Vaza (revision 5037754) +Jurgis Kasakauskis (revision 5047829) +Jurgis Kazimieras Ancuta (revision 5059404) +Jurgis Mikalojus Tiškevičius (revision 4939554) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-21 00:25:34.773941 + +60 characters appeared 353051 times. 
+ +First 38 characters: + 0 Char i: 13.032394753165974 % + 1 Char a: 11.167225131779828 % + 2 Char s: 8.586578143101137 % + 3 Char o: 7.018815978428046 % + 4 Char e: 5.525830545728521 % + 5 Char r: 5.469181506354606 % + 6 Char n: 5.142599794363987 % + 7 Char t: 5.105777918770942 % + 8 Char u: 4.270487833202568 % + 9 Char k: 3.9617505686147325 % +10 Char l: 3.9051015292408184 % +11 Char m: 3.359854525266888 % +12 Char d: 3.0372382460324427 % +13 Char v: 2.7270847554602593 % +14 Char j: 2.4472385009531203 % +15 Char p: 2.329125253858508 % +16 Char g: 1.9427788053284087 % +17 Char ė: 1.5657794482950054 % +18 Char b: 1.5074309377398734 % +19 Char y: 1.2236192504765602 % +20 Char ų: 1.181698961339863 % +21 Char š: 0.9630336693565519 % +22 Char ž: 0.8171623929687212 % +23 Char c: 0.5959478942135839 % +24 Char č: 0.48010060869392807 % +25 Char f: 0.428266737666796 % +26 Char h: 0.42515104050123065 % +27 Char z: 0.4010751987673169 % +28 Char ū: 0.3685020011273159 % +29 Char ą: 0.3526402701026197 % +30 Char į: 0.29004308159444386 % +31 Char ę: 0.14813723796278724 % +32 Char x: 0.08752276583269838 % +33 Char w: 0.059198246145740985 % +34 Char ō: 0.01812769259965274 % +35 Char ö: 0.008780601102956797 % +36 Char é: 0.0076476203154785 % +37 Char q: 0.007364375118608926 % + +The first 38 characters have an accumulated ratio of 0.9996629382157253. + +1016 sequences found. + +First 512 (typical positive ratio): 0.9928710196247589 +Next 512 (512-1024): 0.008171623929687212 +Rest: -4.85722573273506e-17 + +- Processing end: 2016-09-21 00:25:34.935858
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangMalteseModel.log
Added
@@ -0,0 +1,147 @@ += Logs of language model for Maltese (mt) = + +- Generated by BuildLangModel.py +- Started: 2016-09-21 02:05:23.411546 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Unjoni Ewropea (revision 246298) +1951 (revision 229183) +1952 (revision 229184) +1957 (revision 229188) +1958 (revision 229189) +1973 (revision 223536) +1979 (revision 243876) +1981 (revision 205545) +1985 (revision 216368) +1986 (revision 231433) +1990 (revision 237666) +1992 (revision 244087) +1995 (revision 214650) +1 ta' Mejju (revision 245374) +2007 (revision 214851) +2013 (revision 245606) +Albanija (revision 243079) +Awstrija (revision 243627) +Awtonomija (revision 245824) +Ażores (revision 246298) +Bank Ċentrali Ewropew (revision 246298) +Belt kapitali (revision 237400) +Belġju (revision 244363) +Brussell (revision 243311) +Bulgarija (revision 243622) +Danimarka (revision 244419) +De facto (revision 215102) +Estonja (revision 243826) +European Free Trade Association (revision 246298) +Ewropa (revision 244177) +Ex Repubblika Jugoslava tal-Maċedonja (revision 246298) +Federazzjoni (revision 246226) +Finlandja (revision 245824) +Frankfurt (revision 243576) +Franza (revision 244461) +Greċja (revision 244423) +Groenlandja (revision 243829) +Indja (revision 244873) +Islanda (revision 243771) +Isle of Man (revision 246298) +Istitut tal-Unjoni Ewropea għall-Istudji dwar is-Sigurtà (revision 244412) +Italja (revision 246323) +Kilometru kwadru (revision 244871) +Komunitajiet Ewropej (revision 246298) +Komunità Ekonomika Ewropea (revision 246298) +Kroazja (revision 245711) +Kummissjoni Ewropea (revision 243311) +Kunsill Ewropew (revision 246298) +Kunsill tal-Ewropa (revision 243334) +Kunsill tal-Unjoni Ewropea (revision 243311) +Latvja (revision 245746) +Lista ta' pajjiżi skont id-daqs (revision 244419) +Lista ta' pajjiżi skont il-popolazzjoni (revision 246128) +Litwanja (revision 243114) +Liġijiet tal-Unjoni Ewropea (revision 246298) +Lussemburgu (revision 
244239) +Lussemburgu (belt) (revision 243587) +Madejra (revision 243625) +Malta (revision 247210) +Montenegro (revision 243930) +Norveġja (revision 243829) +Olanda (revision 243989) +Organizzazzjoni Internazzjonali (revision 246724) +Pajjiżi l-Baxxi (revision 243989) +Pajjiżi membri tal-Unjoni Ewropea (revision 243625) +Pajjiżi ġirien li jdawru l-Unjoni Ewropea (revision 246298) +Parlament Ewropew (revision 243907) +Patt ta' Stabilità u Tkabbir (revision 246298) +Politika agrikola komuni (revision 244363) +Politika reġjonali tal-Unjoni Ewropea (revision 246298) +Polonja (revision 244530) +Portugall (revision 243625) +Relazzjonijiet ta' terzi pajjiżi ma l-UE (revision 246298) +Renju Unit (revision 247318) +Repubblika Federali tal-Ġermanja (revision 244859) +Repubblika tal-Irlanda (revision 243686) +Repubblika Ċeka (revision 246832) +Rumanija (revision 243623) +Segretarjat tal-Parlament Ewropew (revision 246298) +Serbja (revision 243728) +Slovakkja (revision 243831) +Slovenja (revision 244588) +Spanja (revision 246856) +Stati Uniti tal-Amerika (revision 243926) +Stati membri tal-Unjoni Ewropea (revision 243114) +Strasburgu (revision 243503) +Sui generis (revision 247150) +Suq komuni (revision 246298) +Svezja (revision 244871) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-21 02:07:45.508113 + +48 characters appeared 474337 times. 
+ +First 31 characters: + 0 Char a: 12.326257492036252 % + 1 Char i: 12.069899670487438 % + 2 Char t: 8.064941170518008 % + 3 Char l: 7.795301652622502 % + 4 Char e: 6.615971345267184 % + 5 Char n: 6.128132530247482 % + 6 Char r: 5.579577389071483 % + 7 Char u: 4.376424356522894 % + 8 Char o: 3.8337721915009797 % + 9 Char j: 3.7378488289971057 % +10 Char m: 3.6084049947611088 % +11 Char s: 3.3533120966738834 % +12 Char k: 2.588033402412209 % +13 Char d: 2.3173397816320462 % +14 Char p: 2.0555006250830106 % +15 Char b: 2.017131280081461 % +16 Char f: 2.004692866042497 % +17 Char ħ: 1.6372326004507345 % +18 Char w: 1.4801712706366992 % +19 Char g: 1.4763765002519307 % +20 Char z: 1.3150987588992635 % +21 Char ż: 0.9910675321554084 % +22 Char h: 0.9750451683086075 % +23 Char ġ: 0.7640137708000851 % +24 Char ċ: 0.6723068198348432 % +25 Char x: 0.5892435125237964 % +26 Char v: 0.5668965313690478 % +27 Char q: 0.5647883255997318 % +28 Char c: 0.2759641352034524 % +29 Char à: 0.10730767365817974 % +30 Char y: 0.059029761540845424 % + +The first 31 characters have an accumulated ratio of 0.9994708403519017. + +870 sequences found. + +First 512 (typical positive ratio): 0.9959115850692665 +Next 512 (512-1024): 2.108205769315908e-06 +Rest: -4.423544863740858e-17 + +- Processing end: 2016-09-21 02:07:45.646198
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangPolishModel.log
Added
@@ -0,0 +1,154 @@ += Logs of language model for Polish (pl) = + +- Generated by BuildLangModel.py +- Started: 2016-09-21 17:06:43.735784 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Krasnyj Krym (revision 46884814) +1913 (revision 46708474) +1915 (revision 46743905) +1917 (revision 46559521) +1925 (revision 46809935) +1928 (revision 46875978) +1929 (revision 46760445) +1935 (revision 46487358) +1936 (revision 46874348) +1939 (revision 46789269) +1941 (revision 46856112) +1942 (revision 46851808) +1943 (revision 46768330) +1944 (revision 46866229) +1949 (revision 46882598) +1953 (revision 46437607) +1957 (revision 46591716) +1959 (revision 46255886) +Admirał Butakow (revision 45993412) +Admirał Spiridow (revision 45993412) +Aparat torpedowy (revision 46633263) +Askold (revision 45787848) +Avro 504 (revision 44668646) +Ałmaz (1903) (revision 46472283) +Batumi (revision 46594611) +Bomba głębinowa (revision 46011227) +Brest (revision 45771242) +Burta (revision 45569092) +Cagliari (revision 46235605) +Cesariewicz (revision 40031486) +Czerwona Ukraina (revision 45993524) +Daty nowego i starego porządku (revision 45622575) +Drednot (revision 45789788) +Działo przeciwlotnicze (revision 45160162) +Flota Bałtycka Marynarki Wojennej Rosji (revision 45700667) +Gromoboj (revision 44328986) +Hulk (okręt) (revision 46020688) +II wojna światowa (revision 46871591) +I wojna światowa (revision 46869119) +Imperator Nikołaj I (okręt lotniczy) (revision 45520638) +Imperium Rosyjskie (revision 46604959) +Impierator Nikołaj I (1916) (revision 46534166) +Język rosyjski (revision 46433952) +Kanonierka (revision 41091952) +Kanonierki typu Ardagan (revision 46534166) +Kanonierki typu Bobr (revision 45788694) +Kanonierki typu Chiwiniec (revision 46534166) +Kanonierki typu Groziaszczij (revision 46534166) +Kanonierki typu Mandżur (revision 46534166) +Karabin maszynowy DSzK (revision 45587452) +Karabin maszynowy Vickers 12,7 mm (revision 44572918) +Kocioł parowy 
(revision 46716473) +Konstrukcyjna linia wodna (revision 37082620) +Kontrtorpedowce typu Biesstrasznyj (revision 46534166) +Kontrtorpedowce typu Brawyj (revision 46534166) +Kontrtorpedowce typu Grozowoj (revision 46534166) +Kontrtorpedowce typu Prytkij (revision 46534166) +Koń mechaniczny (revision 44722357) +Krab (1915) (revision 42791389) +Kronsztad (revision 46425497) +Krążownik lekki (revision 40661490) +Krążownik liniowy (revision 40601776) +Krążownik pancernopokładowy (revision 40055901) +Krążownik pancerny (revision 40324458) +Krążowniki lekkie typu Swietłana (revision 45993412) +Krążowniki liniowe typu Borodino (revision 45990866) +Krążowniki typu Admirał Nachimow (revision 45993521) +Krążowniki typu Bajan (revision 45991279) +Krążowniki typu Diana (revision 45991349) +Krążowniki typu Izumrud (revision 45991349) +Lend-Lease Act (revision 46877263) +Marynarka Wojenna Związku Socjalistycznych Republik Radzieckich (revision 45795993) +Maszyna sterowa (revision 28497888) +Mecidiye (1903) (revision 43956539) +Mila morska (revision 45754209) +Mina morska (revision 45781427) +Morze Czarne (revision 46729213) +Nadbudówka (revision 45292731) +Neapol (revision 46823083) +Niszczyciel (revision 45799132) +Niszczyciele rakietowe projektu 61 (revision 46498775) +Niszczyciele typu Finn (revision 46620140) +Niszczyciele typu Lejtienant Szestakow (revision 46620140) +Niszczyciele typu Ochotnik (revision 46620140) +Niszczyciele typu Ukraina (revision 46620140) +Noworosyjsk (revision 44721836) +Odessa (revision 45629804) +Oerlikon 20 mm (revision 45493862) +Okres międzywojenny (revision 46668249) +Okręt-baza wodnosamolotów (revision 45115462) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-21 17:21:04.404471 + +78 characters appeared 1159291 times. 
+ +First 37 characters: + 0 Char a: 9.685575062689178 % + 1 Char i: 8.815819324052374 % + 2 Char o: 7.920185699707839 % + 3 Char e: 6.871613770830621 % + 4 Char r: 5.8672067668945935 % + 5 Char n: 5.763608964444647 % + 6 Char s: 4.736688199942896 % + 7 Char k: 4.722196583946568 % + 8 Char z: 4.519227700378939 % + 9 Char w: 4.279512219106333 % +10 Char t: 4.0191806888865695 % +11 Char c: 3.6891513864939864 % +12 Char y: 3.565282573572986 % +13 Char p: 3.0190004062828053 % +14 Char d: 2.851052928039638 % +15 Char l: 2.7930002044352973 % +16 Char m: 2.7530620008263673 % +17 Char u: 2.348504387595522 % +18 Char j: 1.881236031332944 % +19 Char ł: 1.6885320424293815 % +20 Char b: 1.394559260789569 % +21 Char g: 1.3928340684090534 % +22 Char h: 1.163901039514669 % +23 Char ę: 0.8066136975099435 % +24 Char ó: 0.5971753425153823 % +25 Char ą: 0.563275312238256 % +26 Char f: 0.5245447432956868 % +27 Char ż: 0.4545019326467643 % +28 Char ś: 0.39567287247119143 % +29 Char ń: 0.3857530162832283 % +30 Char ć: 0.1397405828217419 % +31 Char v: 0.12455888987320698 % +32 Char ź: 0.10204512930748191 % +33 Char x: 0.05468859846233603 % +34 Char é: 0.020961087423261287 % +35 Char á: 0.01707940456710179 % +36 Char q: 0.011386269711401192 % + +The first 37 characters have an accumulated ratio of 0.9993892818972973. + +1321 sequences found. + +First 512 (typical positive ratio): 0.9894531815946438 +Next 512 (512-1024): 1.7251923805153322e-06 +Rest: 0.0003530230403650733 + +- Processing end: 2016-09-21 17:21:04.878014
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangPortugueseModel.log
Added
@@ -0,0 +1,166 @@ += Logs of language model for Portuguese (pt) = + +- Generated by BuildLangModel.py +- Started: 2016-09-20 23:44:39.722451 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Papagaio-das-mascarenhas (revision 46763149) +Albinismo (revision 46498446) +Alfred Newton (revision 43617011) +Alphonse Milne-Edwards (revision 39740747) +Animalia (revision 46727732) +Asa (revision 46338820) +August von Pelzeln (revision 34726241) +Aves (revision 46728980) +Bico (revision 45311553) +Carl Wilhelm Hahn (revision 45025566) +Carlos Lineu (revision 46625396) +Carolus Linnaeus (revision 46625396) +Cauda (revision 43275401) +Charles Lucien Bonaparte (revision 45529712) +Chordata (revision 46640101) +Cladograma (revision 46700307) +Classe (biologia) (revision 46701409) +Classificação científica (revision 46306288) +Coleção Leverian (revision 45026647) +Comores (revision 46181501) +Coracopsinae (revision 36946101) +Coracopsis nigra (revision 44338845) +Coracopsis vasa (revision 42905822) +Cylindraspis indica (revision 42905410) +Cúlmen (revision 45311553) +Digital object identifier (revision 42172651) +Eclectus roratus (revision 44380798) +Edward Newton (revision 39261469) +Endemismo (revision 45260961) +Epíteto específico (revision 35101647) +Espécie (revision 45685675) +Esquilo-vermelho (revision 43489595) +Estado de conservação (revision 46662839) +Extinção (revision 46526607) +Família (biologia) (revision 46636004) +Filo (revision 46704246) +França (revision 46740839) +François-Nicolas Martinet (revision 43679514) +François Levaillant (revision 40142351) +Fredrik Hasselqvist (revision 44381122) +Fregilupus varius (revision 46555765) +Fumigação (revision 42458244) +George Robert Gray (revision 39047844) +Georges-Louis Leclerc, conde de Buffon (revision 45622418) +Género (biologia) (revision 45296588) +Hermann Schlegel (revision 43137605) +Herpetologista (revision 46207704) +Histoire Naturelle (revision 44293456) +Holótipo (revision 44029660) 
+Ilha da Reunião (revision 45458206) +Ilha vulcânica (revision 37924535) +Ilhas Mascarenhas (revision 45858660) +Ilhas Molucas (revision 45476933) +International Standard Book Number (revision 46326494) +Jacques Barraband (revision 45007769) +Jean Feuilley (revision 43140791) +Johann Georg Wagler (revision 34585234) +John Gerrard Keulemans (revision 39664498) +Julian Hume (revision 41876605) +Leiolopisma (revision 43997173) +Lionel Walter Rothschild (revision 46022922) +Lista Vermelha da IUCN (revision 46569884) +Lista Vermelha da União Internacional para a Conservação da Natureza e dos Recursos Naturais (revision 46569884) +Lista Vermelha de Espécies Ameaçadas da IUCN (revision 46569884) +Lista de aves extintas (revision 45507420) +Londres (revision 46310311) +Língua inglesa (revision 46609785) +Madagascar (revision 46617630) +Mascarenotus grucheti (revision 43145662) +Mathurin Jacques Brisson (revision 36018826) +Maurício (revision 46723599) +Maximiliano I José da Baviera (revision 46372080) +Melanina (revision 46762903) +Museu Nacional de História Natural (França) (revision 43731807) +Naturhistorisches Museum (revision 46694247) +Nesoenas duboisi (revision 43995805) +Nome científico (revision 46671641) +Nomenclatura binomial (revision 46671641) +Nycticorax duboisi (revision 43816214) +Nível do mar (revision 46414695) +Ordem (biologia) (revision 46360024) +Otto Finsch (revision 42362273) +Papagaio (revision 46738207) +Papagaio-cinzento (revision 46673943) +Papagaio-cinzento-de-maurício (revision 46664408) +Pedro Mascarenhas (c. 
1484-1555) (revision 45541977) +Periquito-de-maurício (revision 43010883) +Periquito-de-reunião (revision 43048764) +Peter Mundy (revision 43563846) +Piton des Neiges (revision 45632497) +Pleistoceno (revision 45916874) +Plumagem (revision 34951058) +Ponto quente (revision 45375495) +Porphyrio coerulescens (revision 43672493) +Praslin (revision 40728143) +Psitacídeos (revision 46598835) +Psittaciformes (revision 46598835) +Psittacula (revision 42856453) +Psittaculinae (revision 46760737) +Psittaculini (revision 43015966) +Psittrichasiidae (revision 44385977) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-20 23:47:27.346826 + +51 characters appeared 558324 times. + +First 38 characters: + 0 Char a: 11.864795351802895 % + 1 Char e: 11.44604208309154 % + 2 Char o: 9.868284365350585 % + 3 Char s: 8.346587286235232 % + 4 Char i: 7.118089138206489 % + 5 Char r: 6.394136737808154 % + 6 Char n: 5.568272186042513 % + 7 Char d: 5.243192125002687 % + 8 Char t: 4.80061756256224 % + 9 Char m: 4.498105042949971 % +10 Char c: 3.9747530107965985 % +11 Char u: 3.7229279056605127 % +12 Char l: 3.207814817202914 % +13 Char p: 2.77562848811801 % +14 Char g: 1.3850380782484721 % +15 Char v: 1.3210967108703908 % +16 Char f: 1.122466524813549 % +17 Char b: 0.9702251739133549 % +18 Char h: 0.9130898904578704 % +19 Char é: 0.7026386112723079 % +20 Char ã: 0.7022803963290133 % +21 Char q: 0.5903382265494588 % +22 Char ç: 0.5856814322866293 % +23 Char í: 0.41391736697688086 % +24 Char x: 0.3913498255493226 % +25 Char á: 0.34567742027926435 % +26 Char z: 0.3170202248156984 % +27 Char ó: 0.22925756370852768 % +28 Char j: 0.20454073262120204 % +29 Char ê: 0.20239144296143458 % +30 Char õ: 0.16155493942585308 % +31 Char y: 0.15080849112701586 % +32 Char w: 0.09241945537000021 % +33 Char ú: 0.08794176857881804 % +34 Char k: 0.08364318925928313 % +35 Char â: 0.07898639499645367 % +36 Char à: 0.06859816164091102 % +37 Char ô: 0.031164700066627977 % + +The first 38 characters 
have an accumulated ratio of 0.9998137282294869. + +891 sequences found. + +First 512 (typical positive ratio): 0.9953179582313172 +Next 512 (512-1024): 1.7910747164728723e-06 +Rest: 2.42861286636753e-17 + +- Processing end: 2016-09-20 23:47:27.489355
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangRomanianModel.log
Added
@@ -0,0 +1,153 @@ += Logs of language model for Romanian (ro) = + +- Generated by BuildLangModel.py +- Started: 2016-09-28 18:53:56.086095 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +The Loving Kind (revision 10166481) +12 ianuarie (revision 10711676) +13 decembrie (revision 9938353) +2007 (revision 10716321) +2008 (revision 10752084) +2009 (revision 10654003) +21 noiembrie (revision 10447643) +25 ianuarie (revision 10228199) +31 ianuarie (revision 10718063) +4 Music (revision 9701591) +Billboard (revision 10505294) +Biology (revision 10112430) +Bulgaria (revision 10481051) +CD (revision 10477531) +Call The Shots (revision 10101027) +Call the Shots (revision 10101027) +Can't Speak French (revision 9721506) +Casă de discuri (revision 10611348) +Channel 4 (revision 7953101) +Chemistry (revision 10112479) +Cheryl Cole (revision 10475016) +Chitară (revision 10468266) +Croația (revision 10737746) +Dance (revision 10231736) +Descărcare digitală (revision 10100743) +Digital Spy (revision 9044016) +Discografia Girls Aloud (revision 10172788) +Estonia (revision 10749810) +Europa (revision 10752724) +Fascination Records (revision 9655292) +Fiona Phillips (revision 5384082) +Gen muzical (revision 10534645) +Girls A Live (revision 10112444) +Girls Aloud (revision 10112446) +Good Morning Television (revision 10166481) +Heat World (revision 10166481) +I'll Stand By You (cântec de Girls Aloud) (revision 10112432) +ITunes (revision 10744174) +I Think We're Alone Now (revision 10112427) +Irlanda (revision 10573806) +Jump (cântec de Girls Aloud) (revision 10112438) +Lady GaGa (revision 10753010) +Life Got Cold (revision 10112437) +Limba engleză (revision 10756676) +Long Hot Summer (revision 10112429) +Love Machine (revision 10112433) +MSN Search (revision 10653298) +MTV (revision 10170766) +Mixed Up (revision 10112443) +Muzică electronică (revision 10608432) +Muzică pop (revision 10740529) +Nadine Coyle (revision 10316187) +Neil Tennant (revision 
10499980) +No Good Advice (revision 10112436) +Out Of Control (revision 10112484) +Out of Control (revision 10112484) +Pet Shop Boys (revision 10612741) +Poker Face (revision 10496402) +PopJustice (revision 10625677) +Regatul Unit (revision 10752338) +Regatul Unit al Marii Britanii și Irlandei de Nord (revision 10752338) +Regatul Unit al Marii Britanii și al Irlandei de Nord (revision 10752338) +Republica Irlanda (revision 10573806) +Romanian Top 100 (revision 10736281) +România (revision 10732435) +Sarah Harding (revision 10633651) +Sarah Hearding (revision 10112425) +See the Day (revision 10112431) +Sexy! No No No... (revision 10112425) +Slant Magazine (revision 7697473) +Slovenia (revision 10521499) +Something Kinda Ooooh (revision 10112426) +Sound of the Underground (album) (revision 10112476) +Sound of the Underground (cântec) (revision 10112434) +Tangled Up (revision 10112482) +The Guardian (revision 9752334) +The Paul O'Grady Show (revision 10101027) +The Promise (revision 10166482) +The Show (revision 10112441) +The Sound of Girls Aloud (revision 10112480) +Tonalitate (revision 9966362) +Turneul Out of Control (revision 10112446) +UK Mix (revision 9721468) +UK Singles Chart (revision 10226705) +Ungaria (revision 10737745) +Uniunea Europeană (revision 10751590) +Untouchable (revision 10112410) +Wake Me Up (revision 10112439) +What Will The Neighbours Say? (revision 10112478) +Whole Lotta History (revision 10475020) +Wideboys (revision 10166481) +Wikimedia Commons (revision 9703907) +Xenomania (revision 10112484) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-28 18:58:13.756622 + +60 characters appeared 883554 times. 
+ +First 33 characters: + 0 Char e: 11.67014127036944 % + 1 Char i: 10.97567324690964 % + 2 Char a: 10.080198833348046 % + 3 Char r: 7.490657050955572 % + 4 Char n: 7.18246988865423 % + 5 Char t: 6.516296683620921 % + 6 Char l: 5.595130574928075 % + 7 Char u: 5.551217016730161 % + 8 Char o: 4.922732509840938 % + 9 Char c: 4.495707110148333 % +10 Char s: 3.8308920563994957 % +11 Char d: 3.590499279048027 % +12 Char m: 2.971408651876399 % +13 Char p: 2.902369294915761 % +14 Char ă: 2.1349006399156134 % +15 Char g: 1.2248261000459508 % +16 Char f: 1.1199089133205216 % +17 Char b: 1.0781457613230203 % +18 Char ț: 1.0323081554721047 % +19 Char ș: 0.9732285745975912 % +20 Char î: 0.97017273420753 % +21 Char v: 0.9693804792915882 % +22 Char z: 0.7369102510995367 % +23 Char h: 0.533413916976212 % +24 Char â: 0.4986678799484808 % +25 Char x: 0.22081276300033725 % +26 Char j: 0.20055367300696958 % +27 Char k: 0.1901411798260208 % +28 Char y: 0.15471606715605385 % +29 Char w: 0.11827234102273318 % +30 Char á: 0.016297815413658927 % +31 Char é: 0.013355154297303842 % +32 Char q: 0.00520624659047438 % + +The first 33 characters have an accumulated ratio of 0.9996661211425673. + +981 sequences found. + +First 512 (typical positive ratio): 0.997762564143313 +Next 512 (512-1024): 1.1317927370596478e-06 +Rest: 3.0357660829594124e-18 + +- Processing end: 2016-09-28 18:58:13.862425
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangSlovakModel.log
Added
@@ -0,0 +1,158 @@ += Logs of language model for Slovak (sk) = + +- Generated by BuildLangModel.py +- Started: 2016-09-21 13:26:28.712674 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Dôkaz (matematika) (revision 6358810) +1825 (revision 6122752) +1839 (revision 6165808) +1847 (revision 5941780) +1852 (revision 5941777) +1878 (revision 6221358) +1955 (revision 6226609) +1976 (revision 6310709) +1983 (revision 6356952) +1993 (revision 6348358) +1995 (revision 6277350) +2012 (revision 6291145) +Adrien-Marie Legendre (revision 6060342) +Algebra (revision 6319238) +Algebraická geometria (revision 5964212) +Algebraická rovnica (revision 5288111) +Algebrické číslo (revision 6106622) +Algoritmus (revision 6286937) +Andrew Wiles (revision 5791970) +Arabi (revision 6044956) +Arabčina (revision 6322514) +Aristoteles (revision 6359959) +Arthur Cayley (revision 6332355) +Axióma (revision 6338092) +Babylonia (revision 6168813) +Bernard Bolzano (revision 6261374) +Boh (revision 6282272) +Bolzanova veta (revision 6345299) +Bytie (revision 5274918) +Byzantská ríša (revision 6359782) +Caroline Blundenová (revision 6358810) +Cauchyho postupnosť (revision 6215169) +Celé číslo (revision 6302805) +Charles Hermite (revision 5751036) +Daniel Marcus (revision 5657431) +David Hilbert (revision 5968866) +Dedukcia (revision 6338099) +Definícia (revision 6106684) +Derivácia (funkcia) (revision 5970574) +Desiatková číselná sústava (revision 5924486) +Diofantická rovnica (revision 6327292) +Dynastia Chan (revision 6342042) +Dôkaz (logika) (revision 5495754) +Dôkaz sporom (revision 5940134) +Dôkaz výpočtom (revision 6358810) +Energia (revision 6277761) +Eric Weisstein (revision 6054413) +Ernst Kummer (revision 6001344) +Európa (revision 6295124) +Experiment (revision 6354302) +Fenomén (filozofia) (revision 5420897) +Filozofia (revision 6296369) +Formula (logika) (revision 3916562) +Formálny dôkaz (revision 6358810) +Formálny jazyk (revision 5623029) +Gabriel Cramer 
(revision 5923903) +Galoisova teória (revision 6353573) +Gentzenovský kalkul (revision 6358810) +Geometria (revision 5970028) +Geometrický dôkaz (revision 6358810) +Georg Ferdinand Cantor (revision 6186696) +Giordano Bruno (revision 6312876) +Gottlob Frege (revision 5968855) +Gödelova veta o neúplnosti (revision 5323549) +Hardvér (revision 6214401) +Henri Poincaré (revision 6315506) +Hilbertovský kalkul (revision 6358810) +Hmotnosť (revision 5979540) +Hypotéza (revision 5983410) +Idea (revision 5960449) +India (revision 6362189) +Intuícia (revision 5837951) +Jazyk (lingvistika) (revision 6073293) +John Taylor (revision 6355518) +Kardinálne číslo (revision 6090126) +Kenneth Appel (revision 5968422) +Klasická mechanika (revision 6295646) +Konečná množina (revision 5276494) +Konfucianizmus (revision 5968816) +Kresťanstvo (revision 6289571) +Langlandsov program (revision 6088475) +Latinčina (revision 6121105) +Leonhard Euler (revision 6339382) +Lineárna algebra (revision 5473535) +Logická axióma (revision 5495754) +Logický kalkul (revision 1608550) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-21 13:33:10.330458 + +62 characters appeared 550293 times. 
+ +First 45 characters: + 0 Char o: 8.867094438780795 % + 1 Char a: 8.59705647718579 % + 2 Char e: 8.562347694773512 % + 3 Char n: 6.0867574183207855 % + 4 Char i: 5.828531346028389 % + 5 Char t: 5.366595613609477 % + 6 Char r: 4.977711873492848 % + 7 Char k: 4.264273759615332 % + 8 Char s: 4.257731790155426 % + 9 Char v: 4.117079446767449 % +10 Char l: 3.5979014815743615 % +11 Char d: 3.416361829061972 % +12 Char m: 3.2513588215732345 % +13 Char p: 2.878466562358598 % +14 Char u: 2.5987973679476206 % +15 Char c: 2.419438371921867 % +16 Char z: 2.127412124086623 % +17 Char h: 2.0687161203213558 % +18 Char j: 2.0312815173007834 % +19 Char y: 1.6700194260148686 % +20 Char b: 1.6574806512167153 % +21 Char á: 1.6422160558102683 % +22 Char ý: 1.2564215790497062 % +23 Char í: 1.1326693234331529 % +24 Char č: 0.9473135220691523 % +25 Char é: 0.8913433389121795 % +26 Char ž: 0.7668641978000811 % +27 Char ú: 0.6949025337411161 % +28 Char š: 0.6785476100913513 % +29 Char f: 0.6514711253822963 % +30 Char g: 0.6096752093884531 % +31 Char ť: 0.46375294615777407 % +32 Char ô: 0.4172322744428877 % +33 Char ľ: 0.36053520579036985 % +34 Char x: 0.23114958758334195 % +35 Char ó: 0.2251527822450949 % +36 Char ň: 0.09304134342977287 % +37 Char w: 0.09013380144759246 % +38 Char ä: 0.0694175648245571 % +39 Char ď: 0.06560141597294532 % +40 Char q: 0.01726353051919614 % +41 Char ě: 0.009994675563745132 % +42 Char ĺ: 0.009267790068200032 % +43 Char ö: 0.008904347320427481 % +44 Char ŕ: 0.00599680533824708 % + +The first 45 characters have an accumulated ratio of 0.9998128269848972. + +1181 sequences found. + +First 512 (typical positive ratio): 0.9733303573968434 +Next 512 (512-1024): 1.8172137388627513e-06 +Rest: 0.0003522983638913346 + +- Processing end: 2016-09-21 13:33:10.831531
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangSloveneModel.log
Added
@@ -0,0 +1,148 @@ += Logs of language model for Slovene (sl) = + +- Generated by BuildLangModel.py +- Started: 2016-09-28 22:00:35.243966 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +XCOM: Enemy Unknown (revision 4704271) +1UP.com (revision 4547348) +2K Games (revision 4110089) +Android (operacijski sistem) (revision 4619359) +Animator videoigre (revision 4702643) +App Store (revision 3903089) +Artefakt (revision 4484504) +Athlon (revision 4524746) +Avstralazija (revision 4623530) +Avtopsija (revision 4541344) +Bralno-pisalni pomnilnik (revision 4256388) +Civilization (serija) (revision 4645770) +Deus Ex: Human Revolution (revision 4694860) +Digitalna distribucija (revision 4696215) +DirectX (revision 4477913) +Dishonored (revision 4619444) +Edge (magazine) (revision 4690049) +Electronic Entertainment Expo (revision 4538691) +Enoigralska videoigra (revision 4610359) +Eurogamer (revision 4694860) +Evropa (revision 4687833) +Fantasy Flight Games (revision 4649361) +Firaxis Games (revision 4110089) +GameRankings (revision 3934020) +GameSpot (revision 4238015) +GameSpy (revision 4538691) +GameTrailers (revision 4704271) +Game Informer (revision 4704271) +GamesTM (revision 4704271) +Grafična kartica (revision 4257980) +Granata (revision 3859332) +Holograf (revision 4477482) +IGN (revision 4576233) +IOS (revision 4597264) +Igra igranja vlog (revision 4642276) +Igra na deski (revision 4649363) +Igralna konzola (revision 4649866) +Igralni pogon (revision 4622773) +Intel (revision 4626025) +International Standard Book Number (revision 4015087) +Izdelovalec videoigre (revision 3851747) +Joker (revija) (revision 3867772) +Kotaku (revision 4613535) +Kristal (revision 4156234) +Linux (revision 4524740) +Lovec prestreznik (revision 4102792) +MTV (revision 4621758) +Mac OS X (revision 4601645) +Machinima (revision 4601716) +Major (revision 4245802) +Mednarodna različica (revision 4116054) +Metacritic (revision 3934020) +Michael McCann (skladatelj) 
(revision 4694860) +MicroProse (revision 4382810) +Microsoft Windows (revision 4691357) +Nezemeljsko življenje (revision 4620576) +NowGamer (revision 4704271) +OS X (revision 4601645) +Ognjena ekipa (revision 4694450) +Operacijski sistem (revision 4698515) +Ostrostrelec (revision 4529694) +Pilot (revision 4069093) +PlayStation 3 (revision 4382944) +PlayStation Network (revision 4382944) +PlayStation Vita (revision 3944025) +Pogon igre (revision 4622773) +Procesor (revision 4702518) +Producent videoiger (revision 4599904) +Razvijalec videoiger (revision 4093281) +Računalniška miška (revision 4385579) +Računalniška platforma (revision 4673669) +Severna Amerika (revision 4643798) +Sid Meier (revision 4061487) +Stealth (revision 4618630) +Steam (revision 4696215) +Strateška videoigra (revision 4236795) +Tablični računalnik (revision 4409985) +Take-Two Interactive (revision 4110089) +Telepatija (revision 4481192) +The Bureau: XCOM Declassified (revision 4704271) +The Guardian (revision 3929479) +Trdi disk (revision 4644623) +UFO: Enemy Unknown (revision 4704271) +Unreal Engine (revision 4622773) +Unreal Engine 3 (revision 4622773) +Uporabniški vmesnik (revision 4552473) +Valve Corporation (revision 4110105) +Večigralska videoigra (revision 4618639) +VideoGamer.com (revision 4704271) +Vohunski satelit (revision 4215166) +Vojaška taktika (revision 3970259) +Vojaški čini (revision 4363026) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-28 22:06:46.133919 + +41 characters appeared 411226 times. 
+ +First 29 characters: + 0 Char a: 10.090315301075321 % + 1 Char e: 9.90477255815537 % + 2 Char i: 9.666703953543793 % + 3 Char o: 9.177921629468953 % + 4 Char n: 7.28309980400072 % + 5 Char r: 5.808241696779873 % + 6 Char s: 4.575586174025961 % + 7 Char t: 4.4963110309173056 % + 8 Char j: 4.343840126840229 % + 9 Char l: 4.2672399118732764 % +10 Char v: 3.802775116359374 % +11 Char p: 3.5216644861949393 % +12 Char k: 3.5136397017698293 % +13 Char d: 3.0387183689747244 % +14 Char m: 2.9487435132992563 % +15 Char z: 2.350775485985808 % +16 Char u: 1.9719083910064055 % +17 Char g: 1.9342162217369525 % +18 Char b: 1.5392995579073308 % +19 Char c: 1.2924766430138173 % +20 Char h: 1.1864522184881305 % +21 Char č: 1.137087635509428 % +22 Char š: 0.6932927392723223 % +23 Char ž: 0.45303555709026183 % +24 Char f: 0.40707542811009034 % +25 Char x: 0.19381070263067024 % +26 Char y: 0.19040624863213904 % +27 Char w: 0.18919037220409216 % +28 Char q: 0.011186063138031156 % + +The first 29 characters have an accumulated ratio of 0.9998978663800442. + +727 sequences found. + +First 512 (typical positive ratio): 0.9983524317161332 +Next 512 (512-1024): 2.4317528560937295e-06 +Rest: -3.859759734048396e-17 + +- Processing end: 2016-09-28 22:06:46.601266
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/BuildLangModelLogs/LangSwedishModel.log
Added
@@ -0,0 +1,151 @@ += Logs of language model for Swedish (sv) = + +- Generated by BuildLangModel.py +- Started: 2016-09-28 22:26:37.221506 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Kakapo (revision 36509929) +Akut hotad (revision 32517788) +Aotearoa (revision 36575359) +Art (revision 36771341) +Artepitet (revision 36771341) +Auckland (revision 35752058) +Auktorsnamn (revision 35976965) +BBC (revision 36508743) +Basalomsättning (revision 30567523) +Beilschmiedia tawa (revision 29101923) +Berguv (revision 36295501) +Betesmark (revision 34292168) +Biotop (revision 35528052) +BirdLife International (revision 36124283) +Bonaparte (revision 37325183) +British Museum (revision 36420244) +Bröstben (revision 30602527) +Dacrydium cupressinum (revision 32986501) +Digital object identifier (revision 27637223) +Djur (revision 37300775) +Djurpark (revision 37147093) +Domän (biologi) (revision 33377709) +Don Merton (revision 36509929) +Douglas Adams (revision 36556245) +Däggdjur (revision 37328286) +Ekologisk nisch (revision 33898643) +Ekosystem (revision 36598266) +Endemisk (revision 30647109) +Eukaryoter (revision 37095313) +Evolution (revision 37093592) +Familj (biologi) (revision 30280200) +Femininum (revision 30597527) +Fjäder (biologi) (revision 36364943) +Fjäderdräkt (revision 36364943) +Fladdermöss (revision 37307257) +Flygg (revision 36479633) +Frukter (revision 34088588) +Frö (revision 37333131) +Fågelläte (revision 34034723) +Fåglar (revision 37387306) +Fåglarnas liv (revision 36509929) +Genitiv (revision 37388438) +George Edward Grey (revision 36509929) +George Robert Gray (revision 20426710) +Haasts örn (revision 29175076) +Hauturu/Little Barrier Island (revision 36509929) +Hermelin (revision 36578682) +Hertz (revision 37104488) +Hjortdjur (revision 36493550) +Hund (revision 37351832) +Husdjur (revision 37384850) +Huskatt (revision 32922967) +Hāngi (revision 29609696) +IUCN (revision 30570280) +Iller (revision 30663158) +Infraröd 
(revision 36770733) +Internationella naturvårdsunionen (revision 30570280) +Jordbruk (revision 37352625) +Kahurangi National Park (revision 35956142) +Kamouflage (revision 36579595) +Kaniner (revision 36877621) +Kapiti Island (revision 37395588) +Katt (revision 36734686) +Kelp (revision 30312471) +Kivier (revision 36373234) +Klass (biologi) (revision 30280201) +Kroppsfett (revision 35066611) +Könsdimorfism (revision 30816932) +Könsfördelning (revision 24769321) +Lamm- och fårkött (revision 36187205) +Lek (fortplantningsbeteende) (revision 30508235) +Mandel (revision 36577529) +Maori (revision 32560474) +Maorier (revision 35862066) +Maoripapegojor (revision 36545138) +Mark Carwardine (revision 20375916) +Markpapegoja (revision 36295722) +Maskulinum (revision 32704551) +Masterton (revision 29859631) +Metrosideros umbellata (revision 29071212) +Milford Sound (revision 20284758) +Morrhår (revision 36533839) +Muskelmage (revision 31196380) +Mustela (revision 20934105) +Mårddjur (revision 37306347) +Māori (revision 32560474) +NHNZ (revision 36509929) +Nattpapegoja (revision 33486517) +Nordön (revision 24810231) +Nya Zeeland (revision 36575359) +Näbb (revision 23648463) +Ollonår (revision 36509929) +Ordning (biologi) (revision 30280196) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-28 22:29:21.480287 + +48 characters appeared 594415 times. 
+ +First 31 characters: + 0 Char a: 10.070741821791172 % + 1 Char e: 9.737136512369304 % + 2 Char r: 9.110638190489809 % + 3 Char n: 8.378826240925951 % + 4 Char t: 7.481305148759705 % + 5 Char s: 5.828587771169974 % + 6 Char i: 5.359891658184939 % + 7 Char l: 5.173489901836259 % + 8 Char o: 4.694195133029954 % + 9 Char d: 4.597293136949774 % +10 Char k: 3.297359588839447 % +11 Char m: 3.1898589369379975 % +12 Char g: 3.004466576381821 % +13 Char v: 2.2324470277499726 % +14 Char f: 2.1988005013332437 % +15 Char p: 2.06017681249632 % +16 Char u: 2.0499146219392173 % +17 Char ä: 2.0475593650900468 % +18 Char h: 2.028380845032511 % +19 Char å: 1.5443755625278637 % +20 Char c: 1.442594820117258 % +21 Char ö: 1.3515809661600062 % +22 Char b: 1.268642278542769 % +23 Char j: 0.7302978558751041 % +24 Char y: 0.6699023409570755 % +25 Char x: 0.2111319532649748 % +26 Char w: 0.10262190557102362 % +27 Char z: 0.09151855185350302 % +28 Char é: 0.021197311642539303 % +29 Char ā: 0.011103353717520588 % +30 Char q: 0.007570468443764037 % + +The first 31 characters have an accumulated ratio of 0.999936071599808. + +748 sequences found. + +First 512 (typical positive ratio): 0.997323508584682 +Next 512 (512-1024): 1.6823263208364526e-06 +Rest: 1.7780915628762273e-17 + +- Processing end: 2016-09-28 22:29:21.590354
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/charsets/ibm852.py
Added
@@ -0,0 +1,72 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
+# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'IBM852' +aliases = 'CP852' + +language = \ +{ + 'complete': 'bs', 'hr', 'cs', 'de', 'hu', 'pl', 'sr', 'sk', 'sl', + 'hsb', 'dsb', 'tk' , + 'incomplete': 'ro' +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ + + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET, # 9X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,SYM,SYM, # AX + SYM,SYM,SYM,SYM,SYM,LET,LET,LET,LET,SYM,SYM,SYM,SYM,LET,LET,SYM, # BX + SYM,SYM,SYM,SYM,SYM,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX + LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,LET,LET,SYM, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM, # EX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,LET,LET,SYM,SYM, # FX +
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/charsets/ibm865.py
Added
@@ -0,0 +1,71 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
+# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'IBM865' +aliases = 'CP865', '865', 'CSIBM865' + +language = \ +{ + 'complete': 'no', 'da' , + 'incomplete': +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ + + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,SYM,SYM, # 9X + LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # AX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM, # EX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # FX +
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/charsets/iso-8859-10.py
Added
@@ -0,0 +1,73 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'ISO-8859-10' +aliases = 'ISO_8859-10:1992', 'ISO_8859-10', 'iso-ir-157', + 'csISOLatin6', 'latin6', 'l6' + +language = \ +{ + # Nordic languages. 
Supersedes ISO-8859-4. + 'complete': 'et', 'lv', 'lt', 'kl', 'saam1281' , + 'incomplete': +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ + + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X + SYM,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,SYM,LET,LET, # AX + SYM,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # FX +
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/charsets/iso-8859-13.py
Added
@@ -0,0 +1,72 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'ISO-8859-13' +aliases = 'csISO885913' + +language = \ +{ + # Designed to cover Baltic languages. 
+ 'complete': 'lv', 'lt' , + 'incomplete': +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ + + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET, # AX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX +
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/charsets/iso-8859-16.py
Added
@@ -0,0 +1,83 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +# ISO-8859-1 is the full 8-bit range, IANA-defined, superset of ISO/CEI 8859-1. +# It is basically the same as ISO/CEI 8859-1, but with control characters. 
+# As far as I can see, `iconv` has no support for the ISO/CEI 8859-1 subset, +# so there is no need for us to support it anyway. + +name = 'ISO-8859-16' +aliases = 'ISO_8859-16:2001', 'ISO_8859-16', 'iso-ir-226', + 'csISO885916', 'latin10', 'l10' + +language = \ +{ + # Languages with complete coverage. + # Some languages actually have several alphabets and only one of them is + # compatible with ISO-8859-1 (ex: Kurdish). + # Some don't have a ISO language code (like Leonese, for which I used + # a Glottolog code). + 'complete': 'sq', 'hr', 'hu', 'pl', 'ro', 'sr', 'sl', + 'fr', 'de', 'it', 'ga' , + 'incomplete': +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ + + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X + SYM,LET,LET,LET,SYM,SYM,LET,SYM,LET,SYM,LET,SYM,LET,SYM,LET,LET, # AX + SYM,SYM,LET,LET,LET,SYM,SYM,SYM,LET,LET,LET,SYM,LET,LET,LET,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # FX +
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/charsets/iso-8859-4.py
Added
@@ -0,0 +1,73 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'ISO-8859-4' +aliases = 'ISO_8859-2:1988', 'ISO_8859-4', 'iso-ir-110', + 'csISOLatin4', 'latin4', 'l4' + +language = \ +{ + # Nordic languages. 
Largely superseded by ISO-8859-10. + 'complete': 'et', 'lv', 'lt', 'kl', 'saam1281' , + 'incomplete': +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ + + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X + SYM,LET,LET,LET,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,SYM,LET,SYM, # AX + SYM,LET,SYM,LET,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,LET,LET,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX +
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/charsets/mac-centraleurope.py
Added
@@ -0,0 +1,72 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
+# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'MAC-CENTRALEUROPE' +aliases = + +language = \ +{ + 'complete': 'bs', 'hr', 'cs', 'de', 'hu', 'pl', 'sr', 'sk', 'sl', + 'hsb', 'dsb', 'tk' , + 'incomplete': 'ro' +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ + + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X + SYM,SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,LET,SYM,SYM,LET,LET, # AX + LET,LET,SYM,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # BX + LET,LET,SYM,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,LET,LET,LET,LET,LET, # CX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,LET,LET,LET,SYM,SYM,LET,LET, # DX + LET,LET,SYM,SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM, # FX +
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/charsets/windows-1257.py
Added
@@ -0,0 +1,72 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'WINDOWS-1257' +aliases = 'CP-1257' + +language = \ +{ + # Designed to support the Estonian, Latvian and Lithuanian languages. 
+ 'complete': 'et', 'lv', 'lt' , + 'incomplete': +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ + + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,SYM,SYM,SYM, # 8X + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,SYM,SYM,ILL, # 9X + SYM,ILL,SYM,SYM,SYM,ILL,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET, # AX + SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX +
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/cs.py
Added
@@ -0,0 +1,80 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. 
+name = 'Czech' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'cs' +# ASCII characters are also used in French. +use_ascii = True +# The charsets we want to support and create data for. +charsets = 'ISO-8859-2', 'Windows-1250', 'IBM852', 'MAC-CENTRALEUROPE' + +## Optional Properties ## + +# Alphabet characters. +# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of a same +# character (provided Python algorithms know the right cases). +alphabet = 'áčďéěíňóřšťúůýž' +# The starred page which was rewarded on the main page when I created +# the data. +start_pages = 'Sociální fobie' +# give possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python algorithm to determine upper/lower-case of a given +# character. +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked. +# Note that we are already cleaning away the '=' from the title syntax +# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in +# some language may return weird syntax or UI text which should be +# discarded. If you encounter one of these cases, use this function. +def clean_wikipedia_content(content): + # Do your garbage text cleaning here. + return content
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/script/langs/da.py -> _service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/da.py
Changed
@@ -50,7 +50,7 @@ # ASCII characters are also used in French. use_ascii = True # The charsets we want to support and create data for. -charsets = 'ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252' +charsets = 'ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252', 'IBM865' ## Optional Properties ##
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/et.py
Added
@@ -0,0 +1,57 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
+# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Estonian' +code = 'et' +use_ascii = True +charsets = 'ISO-8859-4', 'ISO-8859-13', 'ISO-8859-15', + 'WINDOWS-1252', 'WINDOWS-1257' + +## Optional Properties ## + +# Alphabet characters. +alphabet = 'äöüõšž' +start_pages = 'Harilik pohl' +wikipedia_code = code +case_mapping = True
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/fi.py
Added
@@ -0,0 +1,60 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
+# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Finnish' +code = 'fi' +use_ascii = True +charsets = 'ISO-8859-1', 'ISO-8859-4', 'ISO-8859-9', + 'ISO-8859-13', 'ISO-8859-15', 'WINDOWS-1252' + +## Optional Properties ## + +# Alphabet characters. +# 'å' (Swedish o), 'š' and 'ž' are rare enough that I don't want to include them +# here. +alphabet = 'äö' +# Some random high quality page found on the Finnish home page. +start_pages = 'Yhdistynyt kuningaskunta' +wikipedia_code = code +case_mapping = True
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/ga.py
Added
@@ -0,0 +1,60 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Irish' +code = 'ga' +# ASCII characters are also used in French. 
+use_ascii = True +# The charsets we want to support and create data for. +charsets = 'ISO-8859-15', 'ISO-8859-1', 'ISO-8859-9', 'WINDOWS-1252' + +## Optional Properties ## + +# XXX: Irish gaelic also uses sometimes the dotless 'i' but without any +# semantic difference from the dotted 'i'. Only for stylistic reasons. +# So I don't add it in the glyph list. +alphabet = 'áéíóú' +start_pages = 'Tracy Caldwell Dyson' +wikipedia_code = code +case_mapping = True
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/hr.py
Added
@@ -0,0 +1,59 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
+# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Croatian' +code = 'hr' +use_ascii = True +charsets = 'ISO-8859-2', 'ISO-8859-13', 'ISO-8859-16', + 'Windows-1250', 'IBM852', 'MAC-CENTRALEUROPE' + +## Optional Properties ## + +# Alphabet characters. +alphabet = 'čćđšž' +# The starred page which was rewarded on the main page when I created +# the data. +start_pages = 'Fizika čvrstog stanja' +wikipedia_code = code +case_mapping = True
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/it.py
Added
@@ -0,0 +1,56 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
+# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Italian' +code = 'it' +use_ascii = True +charsets = 'ISO-8859-1', 'ISO-8859-3', 'ISO-8859-9', + 'ISO-8859-15', 'WINDOWS-1252' + +## Optional Properties ## + +alphabet = 'óéèò' +start_pages = 'Pieve Ligure' +wikipedia_code = code +case_mapping = True
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/lt.py
Added
@@ -0,0 +1,70 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. 
+name = 'Lithuanian' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'lt' +# ASCII characters are also used. +use_ascii = True +# The charsets we want to support and create data for. +charsets = 'ISO-8859-4', 'ISO-8859-10', 'ISO-8859-13', + +## Optional Properties ## + +# Alphabet characters. +# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of a same +# character (provided Python algorithms know the right cases). +alphabet = 'ąčęėįšųūž' +# The start page. Just taking the page which was in front page the day +# I created the data. +start_pages = 'Karūna (laivas)' +# give possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python algorithm to determine upper/lower-case of a given +# character. +case_mapping = True
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/lv.py
Added
@@ -0,0 +1,69 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. 
+name = 'Latvian' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'lv' +# ASCII characters are also used. +use_ascii = True +# The charsets we want to support and create data for. +charsets = 'ISO-8859-4', 'ISO-8859-10', 'ISO-8859-13' + +## Optional Properties ## + +# Alphabet characters. +# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of a same +# character (provided Python algorithms know the right cases). +alphabet = 'āčēģīķļņšūž' +# The start page. Just taking a starred page. +start_pages = 'Zigfrīds Anna Meierovics' +# give possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python algorithm to determine upper/lower-case of a given +# character. +case_mapping = True
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/mt.py
Added
@@ -0,0 +1,80 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. 
+name = 'Maltese' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'mt' +# ASCII characters are also used in French. +use_ascii = True +# The charsets we want to support and create data for. +charsets = 'ISO-8859-3' + +## Optional Properties ## + +# Alphabet characters. +# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of a same +# character (provided Python algorithms know the right cases). +alphabet = 'ċġħż' +# The starred page which was rewarded on the main page when I created +# the data. +start_pages = 'Unjoni Ewropea' +# give possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python algorithm to determine upper/lower-case of a given +# character. +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked. +# Note that we are already cleaning away the '=' from the title syntax +# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in +# some language may return weird syntax or UI text which should be +# discarded. If you encounter one of these cases, use this function. +def clean_wikipedia_content(content): + # Do your garbage text cleaning here. + return content
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/no.py
Added
@@ -0,0 +1,55 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
+# +# ##### END LICENSE BLOCK ##### + +## Mandatory Properties ## + +name = 'Norwegian' +code = 'no' +use_ascii = True +charsets = 'IBM865', 'ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252' + +## Optional Properties ## + +# Alphabet characters. +alphabet = 'æøåéìîàêÆØÅ' +# Some pages that should contain mostly Norwegian-language text. +start_pages = 'Norsk', 'Saft', 'Hund' +wikipedia_code = code +case_mapping = True
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/pl.py
Added
@@ -0,0 +1,81 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. 
+name = 'Polish' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'pl' +# ASCII characters are also used in Polish. +use_ascii = True +# The charsets we want to support and create data for. +charsets = 'ISO-8859-2', 'ISO-8859-13', 'ISO-8859-16', + 'Windows-1250', 'IBM852', 'MAC-CENTRALEUROPE' + +## Optional Properties ## + +# Alphabet characters. +# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of a same +# character (provided Python algorithms know the right cases). +alphabet = 'ąćęłńóśźż' +# The starred page which was rewarded on the main page when I created +# the data. +start_pages = 'Krasnyj Krym' +# give possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python algorithm to determine upper/lower-case of a given +# character. +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked. +# Note that we are already cleaning away the '=' from the title syntax +# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in +# some language may return weird syntax or UI text which should be +# discarded. If you encounter one of these cases, use this function. +def clean_wikipedia_content(content): + # Do your garbage text cleaning here. + return content
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/pt.py
Added
@@ -0,0 +1,80 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. 
+name = 'Portuguese' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'pt' +# ASCII characters are also used in Portuguese. +use_ascii = True +# The charsets we want to support and create data for. +charsets = 'ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252', 'ISO-8859-9' + +## Optional Properties ## + +# Alphabet characters. +# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of a same +# character (provided Python algorithms know the right cases). +alphabet = 'áâãàçéêíóôõú' +# The starred page which was rewarded on the main page when I created +# the data. +start_pages = 'Papagaio-das-mascarenhas' +# give possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python algorithm to determine upper/lower-case of a given +# character. +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked. +# Note that we are already cleaning away the '=' from the title syntax +# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in +# some language may return weird syntax or UI text which should be +# discarded. If you encounter one of these cases, use this function. +def clean_wikipedia_content(content): + # Do your garbage text cleaning here. + return content
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/ro.py
Added
@@ -0,0 +1,65 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
+# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Romanian' +code = 'ro' +use_ascii = True +charsets = 'ISO-8859-2', 'ISO-8859-16', + 'Windows-1250', 'IBM852' + +## Optional Properties ## + +# Alphabet characters. +# Note: Wikipedia explains that s and t with cedilla (şţ), or even +# bare s and t, were often used in place of s and t with comma (șț) +# because of missing characters in most common encoding at the time. +# It may be worth adding some common_replacement_letters logics in +# the training and models. +# https://en.wikipedia.org/wiki/Romanian_alphabet#ISO_8859 +alphabet = 'ăâîșț' +# The starred page which was rewarded on the main page when I created +# the data. +start_pages = 'The Loving Kind' +wikipedia_code = code +case_mapping = True
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/sk.py
Added
@@ -0,0 +1,80 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. 
+name = 'Slovak' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'sk' +# ASCII characters are also used in Slovak. +use_ascii = True +# The charsets we want to support and create data for. +charsets = 'ISO-8859-2', 'Windows-1250', 'IBM852', 'MAC-CENTRALEUROPE' + +## Optional Properties ## + +# Alphabet characters. +# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of a same +# character (provided Python algorithms know the right cases). +alphabet = 'áäčďĺľňóŕšťúýž' +# The starred page which was rewarded on the main page when I created +# the data. +start_pages = 'Dôkaz (matematika)' +# give possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python algorithm to determine upper/lower-case of a given +# character. +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked. +# Note that we are already cleaning away the '=' from the title syntax +# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in +# some language may return weird syntax or UI text which should be +# discarded. If you encounter one of these cases, use this function. +def clean_wikipedia_content(content): + # Do your garbage text cleaning here. + return content
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/sl.py
Added
@@ -0,0 +1,59 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
+# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Slovene' +code = 'sl' +use_ascii = True +charsets = 'ISO-8859-2', 'ISO-8859-16', + 'Windows-1250', 'IBM852', 'MAC-CENTRALEUROPE' + +## Optional Properties ## + +# Alphabet characters. +alphabet = 'čšž' +# The starred page which was rewarded on the main page when I created +# the data. +start_pages = 'XCOM: Enemy Unknown' +wikipedia_code = code +case_mapping = True
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/script/langs/sv.py
Added
@@ -0,0 +1,56 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
+# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Swedish' +code = 'sv' +use_ascii = True +charsets = 'ISO-8859-1', 'ISO-8859-4', 'ISO-8859-9', + 'ISO-8859-15', 'WINDOWS-1252' + +## Optional Properties ## + +alphabet = 'åäö' +start_pages = 'Kakapo' +wikipedia_code = code +case_mapping = True
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/CMakeLists.txt -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/CMakeLists.txt
Changed
@@ -9,18 +9,34 @@ JpCntx.cpp LangModels/LangArabicModel.cpp LangModels/LangBulgarianModel.cpp - LangModels/LangRussianModel.cpp + LangModels/LangCroatianModel.cpp + LangModels/LangCzechModel.cpp LangModels/LangEsperantoModel.cpp + LangModels/LangEstonianModel.cpp + LangModels/LangFinnishModel.cpp LangModels/LangFrenchModel.cpp LangModels/LangDanishModel.cpp LangModels/LangGermanModel.cpp LangModels/LangGreekModel.cpp LangModels/LangHungarianModel.cpp LangModels/LangHebrewModel.cpp + LangModels/LangIrishModel.cpp + LangModels/LangItalianModel.cpp + LangModels/LangLithuanianModel.cpp + LangModels/LangLatvianModel.cpp + LangModels/LangMalteseModel.cpp + LangModels/LangPolishModel.cpp + LangModels/LangPortugueseModel.cpp + LangModels/LangRomanianModel.cpp + LangModels/LangRussianModel.cpp + LangModels/LangSlovakModel.cpp + LangModels/LangSloveneModel.cpp + LangModels/LangSwedishModel.cpp LangModels/LangSpanishModel.cpp LangModels/LangThaiModel.cpp LangModels/LangTurkishModel.cpp LangModels/LangVietnameseModel.cpp + LangModels/LangNorwegianModel.cpp nsHebrewProber.cpp nsCharSetProber.cpp nsBig5Prober.cpp @@ -64,6 +80,16 @@ ${UCHARDET_LIBRARY} ${UCHARDET_SOURCES} ) +target_compile_definitions("${UCHARDET_LIBRARY}" PRIVATE BUILDING_UCHARDET) +if(BUILD_SHARED_LIBS) + target_compile_definitions("${UCHARDET_LIBRARY}" PUBLIC UCHARDET_SHARED) +endif() + +target_include_directories(${UCHARDET_LIBRARY} + PUBLIC + "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>" + "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/${PACKAGE_NAME}>" +) if (UCHARDET_STATIC_LIBRARY) add_library( @@ -71,6 +97,13 @@ STATIC ${UCHARDET_SOURCES} ) + target_compile_definitions("${UCHARDET_STATIC_LIBRARY}" PRIVATE BUILDING_UCHARDET) + + target_include_directories(${UCHARDET_STATIC_LIBRARY} + PUBLIC + "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>" + "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/${PACKAGE_NAME}>" + ) endif (UCHARDET_STATIC_LIBRARY) set_target_properties( @@ -101,6 +134,8 @@ install( 
TARGETS ${UCHARDET_LIBRARY} + EXPORT + UchardetTargets LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION @@ -110,6 +145,8 @@ install( TARGETS ${UCHARDET_LIBRARY} + EXPORT + UchardetTargets RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION @@ -121,6 +158,8 @@ install( TARGETS ${UCHARDET_STATIC_LIBRARY} + EXPORT + UchardetTargets ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} )
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/EUCTWFreq.tab -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/EUCTWFreq.tab
Changed
@@ -57,7 +57,7 @@ #define EUCTW_TYPICAL_DISTRIBUTION_RATIO (float)0.75 //Char to FreqOrder table , -#define EUCTW_TABLE_SIZE 8102 +#define EUCTW_TABLE_SIZE 5376 static const PRInt16 EUCTWCharToFreqOrder = {
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/LangModels/LangArabicModel.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangArabicModel.cpp
Changed
@@ -262,4 +262,4 @@ (float)0.9696025116913417, PR_FALSE, "WINDOWS-1256" -}; \ No newline at end of file +};
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangCroatianModel.cpp
Added
@@ -0,0 +1,292 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Croatian *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-25 23:50:27.590137 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Windows_1250_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 4X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 6X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM, 23,SYM, 49, 50, 24, 51, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 23,SYM, 52, 53, 24, 54, /* 9X */ + SYM,SYM,SYM, 40,SYM, 55,SYM,SYM,SYM,SYM, 56,SYM,SYM,SYM,SYM, 57, /* AX */ + SYM,SYM,SYM, 40,SYM,SYM,SYM,SYM,SYM, 58, 59,SYM, 60,SYM, 61, 62, /* BX */ + 63, 41, 43, 64, 36, 65, 25, 39, 18, 31, 66, 47, 67, 68, 69, 70, /* CX */ + 26, 71, 72, 44, 73, 74, 
32,SYM, 75, 76, 48, 77, 33, 78, 79, 80, /* DX */ + 81, 41, 43, 82, 36, 83, 25, 39, 18, 31, 84, 47, 85, 86, 87, 88, /* EX */ + 26, 89, 90, 44, 91, 92, 32,SYM, 93, 94, 48, 95, 33, 96, 97,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_2_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 4X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 6X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 98,SYM, 40,SYM, 99,100,SYM,SYM, 23,101,102,103,SYM, 24,104, /* AX */ + SYM,105,SYM, 40,SYM,106,107,SYM,SYM, 23,108,109,110,SYM, 24,111, /* BX */ + 112, 41, 43,113, 36,114, 25, 39, 18, 31,115, 47,116,117,118,119, /* CX */ + 26,120,121, 44,122,123, 32,SYM,124,125, 48,126, 33,127,128,129, /* DX */ + 130, 41, 43,131, 36,132, 25, 39, 18, 31,133, 47,134,135,136,137, /* EX */ + 26,138,139, 44,140,141, 32,SYM,142,143, 48,144, 33,145,146,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_16_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 4X 
*/ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 6X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,147,148, 40,SYM,SYM, 23,SYM, 23,SYM,149,SYM,150,SYM,151,152, /* AX */ + SYM,SYM, 18, 40, 24,SYM,SYM,SYM, 24, 18,153,SYM, 45, 45,154,155, /* BX */ + 46, 41, 43,156, 36, 25,157, 39, 35, 31, 42, 47,158,159,160,161, /* CX */ + 26,162,163, 44,164,165, 32,166,167,168, 48,169, 33,170,171,172, /* DX */ + 46, 41, 43,173, 36, 25,174, 39, 35, 31, 42, 47,175,176,177,178, /* EX */ + 26,179,180, 44,181,182, 32,183,184,185, 48,186, 33,187,188,189, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Mac_Centraleurope_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 4X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 6X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 36,190,191, 31,192, 32, 33, 41,193, 18, 36, 18, 25, 25, 31,194, /* 8X */ + 195,196,197,198,199,200,201, 44,202,203, 32, 37, 48,204,205, 33, /* 9X */ + SYM,SYM,206,SYM,SYM,SYM,SYM,207,SYM,SYM,SYM,208,SYM,SYM,209,210, /* AX */ + 211,212,SYM,SYM,213,214,SYM,SYM, 40,215,216,217,218,219,220,221, /* BX */ + 222,223,SYM,SYM,224,225,SYM,SYM,SYM,SYM,SYM,226,227, 37,228, 38, /* CX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 38,229,230,231,SYM,SYM,232,233, /* DX */ + 234, 
23,SYM,SYM, 23,235,236, 41,237,238,239, 24, 24,240, 44,241, /* EX */ + 242,243, 48,244,245,246,247,248,249,249,249,249, 40,249,249,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_13_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 4X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 6X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM,249,SYM,SYM,SYM,SYM,249, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM,249,SYM,SYM,SYM,SYM,249, /* BX */ + 249,249,249, 25, 36,249,249,249, 18, 31,249,249,249,249,249,249, /* CX */ + 23,249,249, 44, 38, 37, 32,SYM,249, 40,249,249, 33,249, 24,249, /* DX */ + 249,249,249, 25, 36,249,249,249, 18, 31,249,249,249,249,249,249, /* EX */ + 23,249,249, 44, 38, 37, 32,SYM,249, 40,249,249, 33,249, 24,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Ibm852_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 4X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 
16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 6X */ + 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 39, 33, 31, 43, 36,249, 25, 39, 40, 47,249,249,249,249, 36, 25, /* 8X */ + 31,249,249,249, 32,249,249,249,249, 32, 33,249,249, 40,SYM, 18, /* 9X */ + 41,249, 44, 48,249,249, 24, 24,249,249,SYM,249, 18,249,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 41, 43,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM,249,249,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + 26, 26,249, 47,249,249,249,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* DX */ + 44,249,249,249,249,249, 23, 23,249, 48,249,249,249,249,249,SYM, /* EX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,249,249,249,SYM,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 712 + * First 512 sequences: 0.9989731099787131 + * Next 512 sequences (512-1024): 0.0010268900212868262 + * Rest: 3.7513395167998453e-17 + * Negative sequences: TODO + */ +static const PRUint8 CroatianLangModel = +{
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangCzechModel.cpp
Added
@@ -0,0 +1,281 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Czech *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-21 03:28:11.733089 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Windows_1250_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 4X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 6X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM, 29,SYM, 46, 38, 26, 47, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 29,SYM, 46, 38, 26, 48, /* 9X */ + SYM,SYM,SYM, 49,SYM, 50,SYM,SYM,SYM,SYM, 51,SYM,SYM,SYM,SYM, 52, /* AX */ + SYM,SYM,SYM, 53,SYM,SYM,SYM,SYM,SYM, 54, 55,SYM, 45,SYM, 45, 56, /* BX */ + 57, 18, 58, 59, 42, 60, 61, 62, 25, 24, 63, 64, 23, 11, 65, 39, /* CX */ + 66, 67, 35, 37, 68, 69, 
41,SYM, 27, 31, 33, 70, 43, 28, 71, 72, /* DX */ + 73, 18, 74, 75, 42, 76, 77, 78, 25, 24, 79, 80, 23, 11, 81, 39, /* EX */ + 82, 83, 35, 37, 84, 85, 41,SYM, 27, 31, 33, 86, 43, 28, 87,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Mac_Centraleurope_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 4X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 6X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 42, 88, 89, 24, 90, 41, 43, 18, 91, 25, 42, 25, 92, 93, 24, 94, /* 8X */ + 95, 39, 11, 39, 44, 44, 96, 37, 97, 98, 41, 99, 33, 23, 23, 43, /* 9X */ + SYM,SYM,100,SYM,SYM,SYM,SYM,101,SYM,SYM,SYM,102,SYM,SYM,103,104, /* AX */ + 105,106,SYM,SYM,107,108,SYM,SYM,109,110,111, 45, 45,112,113,114, /* BX */ + 115,116,SYM,SYM,117, 35,SYM,SYM,SYM,SYM,SYM, 35,118,119,120,121, /* CX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,122,123,124, 27,SYM,SYM, 27,125, /* DX */ + 126, 29,SYM,SYM, 29, 46, 46, 18, 38, 38, 11, 26, 26,127, 37,128, /* EX */ + 129, 31, 33, 31,130,131,132,133, 28, 28,134,135,136,137,138,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Ibm852_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, 
/* 4X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 6X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 139, 43, 24,140, 42, 31,141,142,143,144,145,146,147,148, 42,149, /* 8X */ + 24,150,151,152, 41, 45, 45, 46, 46, 41, 43, 38, 38,153,SYM, 25, /* 9X */ + 18, 11, 37, 33,154,155, 26, 26,156,157,SYM,158, 25,159,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 18,160, 23,161,SYM,SYM,SYM,SYM,162,163,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM,164,165,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + 166,167, 39,168, 39, 35, 11,169, 23,SYM,SYM,SYM,SYM,170, 31,SYM, /* DX */ + 37,171,172,173,174, 35, 29, 29,175, 33,176,177, 28, 28,178,SYM, /* EX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,179, 27, 27,SYM,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_2_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 4X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 6X */ + 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,180,SYM,181,SYM, 45, 46,SYM,SYM, 29,182, 38,183,SYM, 26,184, /* AX */ + SYM,185,SYM,186,SYM, 45, 46,SYM,SYM, 29,187, 38,188,SYM, 26,189, /* BX */ + 190, 18,191,192, 42,193,194,195, 25, 24,196,197, 23, 11,198, 39, /* CX */ + 199,200, 35, 37,201,202, 41,SYM, 27, 31, 33,203, 43, 28,204,205, /* DX */ + 206, 
18,207,208, 42,209,210,211, 25, 24,212,213, 23, 11,214, 39, /* EX */ + 215,216, 35, 37,217,218, 41,SYM, 27, 31, 33,219, 43, 28,220,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 1025 + * First 512 sequences: 0.9786035192432675 + * Next 512 sequences (512-1024): 0.02139445610866691 + * Rest: 2.0246480655940202e-06 + * Negative sequences: TODO + */ +static const PRUint8 CzechLangModel = +{ + 2,2,3,2,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3, + 2,3,3,0,0,3,3,3,0,2,3,0,3,0,3,2,2,0,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,2,3, + 2,3,3,0,0,3,3,3,0,3,3,2,3,2,3,2,2,2,2,2,2, + 3,3,3,3,3,3,3,2,0,2,3,3,3,3,3,3,2,3,3,3, + 3,2,2,3,3,2,2,0,3,2,3,3,3,0,2,0,0,2,0,0,2, + 3,3,3,2,2,3,3,3,3,3,3,0,3,3,3,3,3,3,0,3, + 3,3,3,0,0,3,3,3,0,3,3,0,3,0,3,2,2,0,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3, + 0,2,3,0,2,3,3,2,0,3,3,0,3,0,2,2,2,2,2,0,2, + 3,3,3,3,3,2,2,3,2,3,3,3,3,3,2,2,2,3,3,3, + 3,2,2,3,3,2,0,3,3,3,0,3,2,0,0,2,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2, + 3,2,3,0,2,2,0,0,2,0,2,2,2,2,0,2,2,0,2,0,0, + 3,3,3,3,3,2,2,0,2,3,3,3,3,3,2,3,0,2,3,3, + 3,2,2,3,3,2,2,2,3,3,0,3,0,0,0,2,0,2,0,0,0, + 3,3,3,3,3,3,3,0,2,3,3,3,2,3,2,2,2,2,3,0, + 3,2,2,3,2,2,0,3,2,2,2,3,2,0,2,2,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,2,2,2,3,3,2,3,3,2,3,3, + 3,0,3,0,3,3,2,0,3,2,2,3,3,0,0,2,2,2,2,2,2, + 3,3,3,3,3,3,2,2,2,2,2,3,3,3,2,2,2,2,3,3, + 3,0,2,0,3,2,2,0,3,3,2,3,2,0,0,2,0,2,0,0,0, + 0,2,3,0,2,3,3,3,3,3,3,2,3,0,3,2,3,3,0,3, + 0,3,2,0,0,3,3,2,0,2,0,0,2,0,0,0,0,0,2,0,0, + 3,3,3,3,3,3,2,3,0,3,3,0,2,3,3,3,2,2,3,2, + 3,2,3,0,3,2,2,2,3,0,2,3,2,0,0,0,0,2,0,0,0, + 2,2,3,3,3,3,3,3,3,3,3,0,3,2,3,3,3,3,3,3, + 2,3,3,0,0,3,3,2,0,3,2,0,2,0,2,2,2,0,2,2,0, + 3,3,3,3,3,3,2,2,2,2,2,3,3,2,2,3,2,3,2,2, + 2,0,2,0,2,0,0,0,0,0,2,2,0,0,0,2,0,0,0,0,2, + 3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,3,2,3,2, + 3,0,2,3,3,2,2,2,2,2,2,3,2,0,2,2,2,0,0,0,0, + 3,3,3,3,3,2,2,0,3,3,3,3,2,3,2,2,2,2,3,2, + 3,2,3,3,3,2,3,2,2,2,2,3,2,0,0,2,0,2,0,0,0, + 3,3,3,3,3,3,2,3,2,3,3,2,2,3,2,2,2,0,3,0, + 
3,2,2,0,2,2,2,2,3,0,2,2,0,0,0,0,2,2,2,0,0, + 0,0,3,0,0,3,3,3,2,3,3,0,3,0,3,3,3,3,0,3, + 0,2,2,0,0,2,3,2,0,3,2,0,0,0,0,2,0,0,0,2,0, + 3,3,3,3,3,3,2,3,3,2,3,3,3,3,2,3,3,3,3,2, + 3,2,2,0,2,2,0,2,2,2,2,2,0,2,0,2,0,2,0,0,0, + 2,2,3,2,2,3,3,3,3,2,3,2,3,2,3,2,3,3,0,3, + 0,2,3,0,0,2,3,2,0,3,2,0,2,2,2,0,0,0,2,0,0,
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/LangModels/LangDanishModel.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangDanishModel.cpp
Changed
@@ -41,7 +41,7 @@ /** * Generated by BuildLangModel.py - * On: 2016-02-19 17:56:42.163975 + * On: 2022-11-30 19:41:17.519380 **/ /* Character Mapping Table: @@ -67,18 +67,18 @@ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ - SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */ - 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ - SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */ - 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ - SYM,SYM,SYM,SYM,SYM,SYM, 39,SYM, 39,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ - SYM,SYM,SYM,SYM, 53, 42,SYM,SYM, 54,SYM,SYM,SYM, 55, 56, 57,SYM, /* BX */ - 58, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 59, 34, 60, 50, /* CX */ - 43, 47, 51, 36, 52, 61, 30,SYM, 19, 62, 37, 44, 31, 46, 63, 48, /* DX */ - 64, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 65, 34, 66, 50, /* EX */ - 43, 47, 51, 36, 52, 67, 30,SYM, 19, 68, 37, 44, 31, 46, 69, 70, /* FX */ + SYM,SYM,SYM,SYM,SYM,SYM, 50,SYM, 50,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 60, 57,SYM,SYM, 61,SYM,SYM,SYM, 43, 43, 62,SYM, /* BX */ + 39, 32, 44, 53, 36, 21, 20, 42, 38, 28, 63, 46, 64, 35, 47, 52, /* CX */ + 31, 48, 58, 29, 49, 59, 34,SYM, 19, 65, 37, 66, 33, 40, 55, 41, /* DX */ + 39, 32, 44, 53, 36, 21, 20, 42, 38, 28, 67, 46, 68, 35, 47, 52, /* EX */ + 31, 48, 58, 29, 49, 59, 34,SYM, 19, 69, 37, 70, 33, 40, 
55, 71, /* FX */ }; /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ @@ -88,18 +88,18 @@ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ - SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */ - 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ - SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */ - 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ - SYM,SYM,SYM,SYM,SYM, 42,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ - 71, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 72, 34, 73, 50, /* CX */ - 43, 47, 51, 36, 52, 74, 30,SYM, 19, 75, 37, 44, 31, 46, 76, 48, /* DX */ - 77, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 78, 34, 79, 50, /* EX */ - 43, 47, 51, 36, 52, 80, 30,SYM, 19, 81, 37, 44, 31, 46, 82, 83, /* FX */ + SYM,SYM,SYM,SYM,SYM, 57,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 39, 32, 44, 53, 36, 21, 20, 42, 38, 28, 72, 46, 73, 35, 47, 52, /* CX */ + 31, 48, 58, 29, 49, 59, 34,SYM, 19, 74, 37, 75, 33, 40, 55, 41, /* DX */ + 39, 32, 44, 53, 36, 21, 20, 42, 38, 28, 76, 46, 77, 35, 47, 52, /* EX */ + 31, 48, 58, 29, 49, 59, 34,SYM, 19, 78, 37, 79, 33, 40, 55, 80, /* FX */ }; /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ @@ -109,61 +109,83 @@ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 
1X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ - SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */ - 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ - SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */ - 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ - SYM,ILL,SYM, 84,SYM,SYM,SYM,SYM,SYM,SYM, 39,SYM, 85,ILL, 86,ILL, /* 8X */ - ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 39,SYM, 87,ILL, 88, 89, /* 9X */ + SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 81,SYM,SYM,SYM,SYM,SYM,SYM, 50,SYM, 43,ILL, 82,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 50,SYM, 43,ILL, 83, 84, /* 9X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ - SYM,SYM,SYM,SYM,SYM, 42,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ - 90, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 91, 34, 92, 50, /* CX */ - 43, 47, 51, 36, 52, 93, 30,SYM, 19, 94, 37, 44, 31, 46, 95, 48, /* DX */ - 96, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 97, 34, 98, 50, /* EX */ - 43, 47, 51, 36, 52, 99, 30,SYM, 19,100, 37, 44, 31, 46,101,102, /* FX */ + SYM,SYM,SYM,SYM,SYM, 57,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 39, 32, 44, 53, 36, 21, 20, 42, 38, 28, 85, 46, 86, 35, 47, 52, /* CX */ + 31, 48, 58, 29, 49, 59, 34,SYM, 19, 87, 37, 88, 33, 40, 55, 41, /* DX */ + 39, 32, 44, 53, 36, 21, 20, 42, 38, 28, 89, 46, 90, 35, 47, 52, /* EX */ + 31, 48, 58, 29, 49, 59, 34,SYM, 19, 91, 37, 92, 33, 40, 55, 93, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Ibm865_CharToOrderMap = +{ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 4, 16, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 42, 33, 28, 44, 36, 39, 21, 42, 94, 46, 38, 52, 47, 95, 36, 21, /* 8X */ + 28, 20, 20, 49, 34, 58, 96, 97, 98, 34, 33, 19,SYM, 19,SYM,SYM, /* 9X */ + 32, 35, 29, 37, 48, 48,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* DX */ + 45, 41, 99, 56,100,101, 57, 54,102,103,104,105,106,107, 51,SYM, /* EX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* FX */ }; /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ /* Model Table: - * Total sequences: 964 - * First 512 sequences: 0.9968082796759031 - * Next 512 sequences (512-1024): 0.0031917203240968304 - * Rest: 3.903127820947816e-17 + * Total sequences: 1065 + * First 512 sequences: 0.9958348814328518 + * Next 512 sequences (512-1024): 0.0041324290837536455 + * Rest: 3.268948339453948e-05 * Negative sequences: TODO */ static const PRUint8 DanishLangModel = { - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,3,3,2,3,0,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,3,3,3,3,3,2,2, - 
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,3,3,2,3,3,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,0,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,3,3,2,2,0,0, - 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,3,3,3,2,2,2,2,3,2, - 3,3,3,3,3,3,3,2,3,3,2,3,3,2,3,2,3,2,3,3,3,3,3,2,2,2,2,2,0,0, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,3,0, - 3,3,3,3,3,3,3,2,3,3,3,2,2,3,3,3,3,2,3,3,3,3,3,3,2,2,2,2,2,0, - 3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,2,2,2,2,3,3,3,2,2,0,0,2,0, - 3,3,3,3,3,3,3,2,3,3,2,2,2,2,2,3,3,2,2,3,3,3,3,3,2,2,0,0,2,0, - 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,3,2,3,0,2,2,3,2,3,3,0,2, - 3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,3,2,3,3,2,2,0,2,0,2,0, - 3,3,3,3,3,3,2,2,3,3,2,2,3,2,3,2,3,2,2,3,3,3,3,3,2,3,2,2,2,0, - 3,3,3,3,2,2,3,3,3,2,3,3,3,2,3,3,0,2,2,2,2,0,0,3,0,0,2,0,0,0, - 3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,3,3,2,2,2,0,0,0,2,2,2,0,0,0, - 3,3,3,3,2,0,3,3,3,2,3,3,2,2,3,3,0,2,2,2,0,0,0,0,0,0,0,0,0,0, - 2,3,3,3,0,3,3,3,3,2,3,3,3,3,3,3,2,2,2,0,0,0,0,0,2,0,0,0,0,0, - 3,3,2,3,3,3,3,3,3,3,2,2,2,2,2,2,3,2,2,3,3,2,3,2,2,0,0,0,0,0, - 3,3,2,3,3,3,2,2,3,3,2,3,2,2,0,2,3,2,3,0,3,0,0,2,3,2,2,0,2,2, - 3,2,2,2,3,3,2,2,2,3,0,2,2,2,0,2,2,0,2,0,2,0,0,0,2,2,2,0,0,0, - 3,2,2,2,3,3,2,2,0,3,0,2,2,0,0,2,2,2,2,2,2,0,0,2,2,0,2,0,0,0, - 3,2,0,2,2,3,2,0,2,2,0,0,2,2,2,2,2,2,2,2,0,0,0,0,2,2,0,0,2,0, - 2,3,2,2,2,0,2,2,2,2,2,2,2,0,2,2,0,2,0,0,0,0,0,0,2,0,0,0,0,0, - 0,0,0,0,3,2,2,2,2,2,0,0,0,0,2,2,3,0,2,0,0,0,0,0,0,0,0,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,3,3,2,3,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,2, + 3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,2,2,1, + 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,0,2,2,3,3,3,3,3,3,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,2,3,3,2,0,2, + 
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,0,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,1,2,3,3,3,3,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,2,0,0,2,0,0, + 3,3,3,3,3,3,3,2,3,3,2,3,3,3,3,3,2,2,3,3,3,3,3,3,1,2,2,1,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,2,3,3,3,3,3,3,2,0,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,2,2,3,2,2,3,2,2,2,3,3,3,2,3,2,0,0,0,1,1,0, + 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,2,2,0,2,2,3,2,2,3,0,0,2, + 3,3,3,3,3,3,3,2,3,3,2,2,2,2,2,3,3,2,2,3,3,3,3,3,2,0,2,0,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,3,3,3,3,3,2,2,0,1,0,2,2,0, + 3,3,3,3,3,3,3,2,3,3,1,2,3,2,3,3,2,2,2,3,3,3,3,3,2,3,0,0,2,2,1, + 3,3,3,3,0,2,3,3,3,2,3,3,3,2,3,2,3,2,2,0,0,0,2,3,0,2,1,0,0,0,0, + 2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,2,2,0,0,0,0,0,2,0,0,0,0,0,0, + 3,3,3,3,0,0,3,3,3,2,2,3,2,2,3,0,3,2,2,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,2,3,3,3,2,0,0,2,2,2,2,2,0,0,0,0, + 3,3,3,2,3,3,3,3,3,3,2,2,2,2,2,3,2,2,2,3,3,2,3,0,2,0,0,0,0,2,0, + 3,3,2,3,3,3,2,2,3,3,2,3,2,2,2,3,2,2,3,0,2,0,3,2,3,0,2,2,2,2,2, + 3,2,2,2,3,3,2,2,2,3,0,2,2,2,0,2,2,0,2,0,2,0,2,2,2,2,2,0,0,0,2, + 3,2,2,2,3,3,2,2,2,3,2,2,2,2,0,2,2,2,2,0,0,0,2,2,0,2,3,0,0,0,0, + 3,2,1,2,2,2,2,2,2,2,0,2,1,2,2,0,0,2,0,0,0,0,2,0,2,2,0,2,0,0,0, + 2,2,3,2,2,0,2,2,2,2,2,0,2,2,2,2,2,1,2,0,0,0,0,0,1,0,2,0,0,0,0, + 0,3,2,2,2,0,2,0,2,0,2,2,0,2,2,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0, + 0,0,0,0,2,1,0,0,0,0,0,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, }; @@ -171,8 +193,8 @@ { Iso_8859_15_CharToOrderMap, DanishLangModel, - 30,
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/LangModels/LangEsperantoModel.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangEsperantoModel.cpp
Changed
@@ -138,4 +138,4 @@ (float)0.9942980632768038, PR_FALSE, "ISO-8859-3" -}; \ No newline at end of file +};
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangEstonianModel.cpp
Added
@@ -0,0 +1,263 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Estonian *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-26 23:47:54.476870 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_4_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 4X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 6X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 55, 56, 57,SYM, 58, 59,SYM,SYM, 29, 45, 60, 61,SYM, 32,SYM, /* AX */ + SYM, 62,SYM, 63,SYM, 64, 65,SYM,SYM, 29, 45, 66, 67, 68, 32, 69, /* BX */ + 37, 43, 70, 71, 18, 44, 47, 72, 73, 33, 74, 75, 76, 36, 77, 39, /* CX */ + 78, 79, 31, 80, 81, 20, 
24,SYM, 38, 82, 52, 83, 21, 84, 34, 85, /* DX */ + 37, 43, 86, 87, 18, 44, 47, 88, 89, 33, 90, 91, 92, 36, 93, 39, /* EX */ + 94, 95, 31, 96, 97, 20, 24,SYM, 38, 98, 52, 99, 21,100, 34,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 4X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 6X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,101,SYM,SYM,SYM,SYM,SYM,SYM, 29,SYM,102,ILL, 32,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 29,SYM,103,ILL, 32,104, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 50,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 40, 43,105,106, 18, 44, 47, 48, 41, 33,107,108, 35, 36,109,110, /* CX */ + 46,111, 53, 42,112, 20, 24,SYM, 38, 54, 52,113, 21,114,115,116, /* DX */ + 40, 43,117,118, 18, 44, 47, 48, 41, 33,119,120, 35, 36,121,122, /* EX */ + 46,123, 53, 42,124, 20, 24,SYM, 38, 54, 52,125, 21,126,127,128, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_15_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 4X 
*/ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 6X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 29,SYM, 29,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 32, 50,SYM,SYM, 32,SYM,SYM,SYM,129,130,131,SYM, /* BX */ + 40, 43,132,133, 18, 44, 47, 48, 41, 33,134,135, 35, 36,136,137, /* CX */ + 46,138, 53, 42,139, 20, 24,SYM, 38, 54, 52,140, 21,141,142,143, /* DX */ + 40, 43,144,145, 18, 44, 47, 48, 41, 33,146,147, 35, 36,148,149, /* EX */ + 46,150, 53, 42,151, 20, 24,SYM, 38, 54, 52,152, 21,153,154,155, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_13_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 4X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 6X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 38,SYM,156,SYM,SYM,SYM,SYM, 47, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 38,SYM,157,SYM,SYM,SYM,SYM, 47, /* BX */ + 158,159, 37,160, 18, 44,161, 45,162, 33,163,164,165,166, 39,167, /* CX */ + 29,168,169, 42, 31, 20, 24,SYM,170, 51,171, 34, 21, 49, 32,172, /* DX */ + 173,174, 37,175, 
18, 44,176, 45,177, 33,178,179,180,181, 39,182, /* EX */ + 29,183,184, 42, 31, 20, 24,SYM,185, 51,186, 34, 21, 49, 32,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1257_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 4X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 23, 10, 2, 22, 15, 16, 1, 17, 8, 5, 12, 7, 9, /* 6X */ + 14, 28, 11, 3, 4, 6, 13, 27, 26, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,SYM,SYM,SYM, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,SYM,SYM,ILL, /* 9X */ + SYM,ILL,SYM,SYM,SYM,ILL,SYM,SYM, 38,SYM,187,SYM,SYM,SYM,SYM, 47, /* AX */ + SYM,SYM,SYM,SYM,SYM, 50,SYM,SYM, 38,SYM,188,SYM,SYM,SYM,SYM, 47, /* BX */ + 189,190, 37,191, 18, 44,192, 45,193, 33,194,195,196,197, 39,198, /* CX */ + 29,199,200, 42, 31, 20, 24,SYM,201, 51,202, 34, 21, 49, 32,203, /* DX */ + 204,205, 37,206, 18, 44,207, 45,208, 33,209,210,211,212, 39,213, /* EX */ + 29,214,215, 42, 31, 20, 24,SYM,216, 51,217, 34, 21, 49, 32,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 853 + * First 512 sequences: 0.9972721312183132 + * Next 512 sequences (512-1024): 0.0027278687816868537 + * Rest: -5.204170427930421e-18 + * Negative sequences: TODO + */ +static const PRUint8 EstonianLangModel = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,3,0,3,3,3,3,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,2,3,3,2,2,3,3,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,3,0,3,3,3,2,0,2,0,2, + 
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,2,2,0,2,2,0, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,0,3,3,2,3,3,3,2,2,0,3,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,0,0,0,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,3,3,0,0,2,2,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,3,2,3,3,3,3,2,3,3,0,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,0,3,3,2,2,3,3,0,2,0,0,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,2,3,3,0,3,3,3,2,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,2,2,3,0,2,0,3,0,0,0,2,2,2,0,0,0,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,0,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,2,3,3,3,3,3,2,3,3,0,2,0,2,2,0,0, + 3,3,3,3,2,3,3,3,3,3,2,2,2,2,2,2,2,2,3,0,3,2,0,2,3,2,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,2,3,2,3,0,3,3,0,2,3,3,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,3,3,3,0,2,2,2,2,2,0,3,2,0,2,0,2,0,0, + 3,3,3,3,3,3,3,3,3,3,0,3,3,3,0,0,3,3,3,0,3,3,3,2,0,3,0,2,0,0,0,2,0, + 3,3,3,2,3,0,3,3,0,3,0,2,3,0,3,0,0,0,3,0,3,3,0,0,2,0,0,0,0,0,0,0,0, + 2,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,3,0,0,0,0,0,0,2,0,0,0,0,0,0, + 3,3,3,3,2,3,3,3,2,3,0,3,2,0,0,0,2,3,0,2,0,2,0,2,0,2,2,0,0,0,0,0,0, + 0,3,3,3,3,3,3,3,2,0,3,3,3,3,3,3,3,3,0,3,3,0,0,0,0,0,0,0,0,0,2,0,0,
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangFinnishModel.cpp
Added
@@ -0,0 +1,291 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Finnish *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-21 18:15:05.189948 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_15_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 27,SYM, 27,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 28, 61,SYM,SYM, 28,SYM,SYM,SYM, 62, 63, 64,SYM, /* BX */ + 49, 35, 65, 46, 11, 56, 39, 37, 40, 30, 51, 31, 66, 36, 67, 57, /* CX */ + 68, 58, 52, 33, 34, 59, 
22,SYM, 69, 70, 38, 71, 32, 72, 73, 55, /* DX */ + 49, 35, 74, 46, 11, 56, 39, 37, 40, 30, 51, 31, 75, 36, 76, 57, /* EX */ + 77, 58, 52, 33, 34, 59, 22,SYM, 78, 79, 38, 80, 32, 81, 82, 83, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 84,SYM,SYM,SYM,SYM,SYM,SYM, 27,SYM, 85,ILL, 28,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 27,SYM, 86,ILL, 28, 87, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 88,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 49, 35, 89, 46, 11, 56, 39, 37, 40, 30, 51, 31, 90, 36, 91, 57, /* CX */ + 92, 58, 52, 33, 34, 59, 22,SYM, 93, 94, 38, 95, 32, 96, 97, 55, /* DX */ + 49, 35, 98, 46, 11, 56, 39, 37, 40, 30, 51, 31, 99, 36,100, 57, /* EX */ + 101, 58, 52, 33, 34, 59, 22,SYM,102,103, 38,104, 32,105,106,107, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_4_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X 
*/ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,108,109, 47,SYM,110,111,SYM,SYM, 27,112,113,114,SYM, 28,SYM, /* AX */ + SYM,115,SYM, 47,SYM,116,117,SYM,SYM, 27,118,119,120, 45, 28, 45, /* BX */ + 53, 35,121, 46, 11, 56, 39,122, 43, 30,123, 31,124, 36,125,126, /* CX */ + 127, 54,128,129, 34, 59, 22,SYM,130,131, 38,132, 32,133,134, 55, /* DX */ + 53, 35,135, 46, 11, 56, 39,136, 43, 30,137, 31,138, 36,139,140, /* EX */ + 141, 54,142,143, 34, 59, 22,SYM,144,145, 38,146, 32,147,148,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_13_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,149,SYM, 47,SYM,SYM,SYM,SYM, 39, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,150,SYM, 47,SYM,SYM,SYM,SYM, 39, /* BX */ + 151,152, 53, 41, 11, 56,153,154, 43, 30,155,156,157,158,159,160, /* CX */ + 27,161, 54, 33,162, 59, 22,SYM,163,164,165,166, 32, 60, 28, 55, /* DX */ + 167,168, 53, 
41, 11, 56,169,170, 43, 30,171,172,173,174,175,176, /* EX */ + 27,177, 54, 33,178, 59, 22,SYM,179,180,181,182, 32, 60, 28,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_9_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,183,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 49, 35,184, 46, 11, 56, 39, 37, 40, 30, 51, 31,185, 36,186, 57, /* CX */ + 50, 58, 52, 33, 34, 59, 22,SYM,187,188, 38,189, 32, 48, 42, 55, /* DX */ + 49, 35,190, 46, 11, 56, 39, 37, 40, 30, 51, 31,191, 36,192, 57, /* EX */ + 50, 58, 52, 33, 34, 59, 22,SYM,193,194, 38,195, 32, 44, 42,196, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_1_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 4X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,SYM, 
/* 5X */ + SYM, 0, 19, 21, 18, 4, 23, 20, 14, 1, 15, 9, 6, 12, 2, 7, /* 6X */ + 16, 29, 10, 5, 3, 8, 13, 24, 26, 17, 25,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,197,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 49, 35,198, 46, 11, 56, 39, 37, 40, 30, 51, 31,199, 36,200, 57, /* CX */ + 201, 58, 52, 33, 34, 59, 22,SYM,202,203, 38,204, 32,205,206, 55, /* DX */ + 49, 35,207, 46, 11, 56, 39, 37, 40, 30, 51, 31,208, 36,209, 57, /* EX */ + 210, 58, 52, 33, 34, 59, 22,SYM,211,212, 38,213, 32,214,215,216, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 919 + * First 512 sequences: 0.9985378147555799 + * Next 512 sequences (512-1024): 0.0014621852444200612 + * Rest: 3.881443777498106e-17 + * Negative sequences: TODO + */ +static const PRUint8 FinnishLangModel = +{
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/LangModels/LangFrenchModel.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangFrenchModel.cpp
Changed
@@ -203,4 +203,4 @@ (float)0.997057879992383, PR_TRUE, "ISO-8859-15" -}; \ No newline at end of file +};
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/LangModels/LangGermanModel.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangGermanModel.cpp
Changed
@@ -165,4 +165,4 @@ (float)0.9934041448127945, PR_TRUE, "ISO-8859-1" -}; \ No newline at end of file +};
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/LangModels/LangHungarianModel.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangHungarianModel.cpp
Changed
@@ -166,4 +166,4 @@ (float)0.9748272224933486, PR_FALSE, "WINDOWS-1250" -}; \ No newline at end of file +};
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangIrishModel.cpp
Added
@@ -0,0 +1,230 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Irish *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-27 00:33:40.158624 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_1_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */ + 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */ + 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 44,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 45, 14, 46, 47, 33, 48, 49, 39, 35, 18, 42, 37, 50, 17, 51, 40, /* CX */ + 52, 32, 43, 22, 53, 54, 38,SYM, 
36, 55, 20, 56, 31, 57, 58, 59, /* DX */ + 60, 14, 61, 62, 33, 63, 64, 39, 35, 18, 42, 37, 65, 17, 66, 40, /* EX */ + 67, 32, 43, 22, 68, 69, 38,SYM, 36, 70, 20, 71, 31, 72, 73, 74, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */ + 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */ + 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 75,SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM, 76,ILL, 77,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM, 78,ILL, 79, 80, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 81,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 82, 14, 83, 84, 33, 85, 86, 39, 35, 18, 42, 37, 87, 17, 88, 40, /* CX */ + 89, 32, 43, 22, 90, 91, 38,SYM, 36, 92, 20, 93, 31, 94, 95, 96, /* DX */ + 97, 14, 98, 99, 33,100,101, 39, 35, 18, 42, 37,102, 17,103, 40, /* EX */ + 104, 32, 43, 22,105,106, 38,SYM, 36,107, 20,108, 31,109,110,111, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_15_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */ + 
21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */ + 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM, 34,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,112,113,SYM,SYM,114,SYM,SYM,SYM,115,116,117,SYM, /* BX */ + 118, 14,119,120, 33,121,122, 39, 35, 18, 42, 37,123, 17,124, 40, /* CX */ + 125, 32, 43, 22,126,127, 38,SYM, 36,128, 20,129, 31,130,131,132, /* DX */ + 133, 14,134,135, 33,136,137, 39, 35, 18, 42, 37,138, 17,139, 40, /* EX */ + 140, 32, 43, 22,141,142, 38,SYM, 36,143, 20,144, 31,145,146,147, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_9_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */ + 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */ + 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,148,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 149, 14,150,151, 33,152,153, 39, 35, 18, 42, 37,154, 17,155, 40, /* CX */ + 156, 32, 43, 22,157,158, 38,SYM, 36,159, 20,160, 31,161,162,163, /* DX */ + 164, 14,165,166, 
33,167,168, 39, 35, 18, 42, 37,169, 17,170, 40, /* EX */ + 171, 32, 43, 22,172,173, 38,SYM, 36,174, 20,175, 31, 41,176,177, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 701 + * First 512 sequences: 0.9974076651249096 + * Next 512 sequences (512-1024): 0.0025923348750903907 + * Rest: -2.7755575615628914e-17 + * Negative sequences: TODO + */ +static const PRUint8 IrishLangModel = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,0,3,0,3,0,3,3,3,3,2,3,3,2, + 3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,2,3,3,3,3,3,3,3,0,3,3,3,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,0,2,3,0,2, + 3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,2,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,3,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,0,3,3,3,3,3,3,2,3,3,0, + 3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,0,3,0,0, + 3,3,3,3,3,3,3,3,3,3,3,0,2,3,3,3,3,3,3,2,3,0,3,3,3,3,2,2,3,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,0,3,3,3,2,3,3,3,3,2,3,0,3,3,2,0,3,0,2, + 3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,3,0,0, + 2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,0,3,3,3,0,3,3,3,3,3,2,3,2, + 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,3,3,0,3,2,3,2,3,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,3,2,3,0,3,0,2,0,2,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,0,3,3,3,3,3,3,2,3,3,3,0,3,0,0,0,2,2,0, + 0,3,3,0,3,2,3,3,3,3,0,3,3,3,0,0,3,3,0,3,0,3,0,2,0,0,0,0,2,0,0, + 3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,0,3,3,2,2,0,3,0,2,2,2,0,2,3,2,0, + 3,3,3,3,3,3,3,2,2,3,3,2,0,0,3,3,3,3,3,2,3,3,3,0,2,0,0,2,0,0,0, + 2,0,3,0,3,0,3,3,3,3,3,3,3,2,0,0,3,0,0,0,3,0,0,2,0,0,0,0,0,0,0, + 3,3,3,0,2,2,3,3,0,2,3,2,0,2,0,0,2,0,0,2,2,2,0,2,0,0,0,0,0,0,0, + 3,3,0,3,3,3,2,3,2,3,3,0,3,2,3,3,2,3,3,3,0,0,3,2,2,0,0,0,0,0,0, + 2,3,3,0,3,0,3,3,3,3,0,3,2,2,0,0,3,0,0,0,0,3,0,0,0,0,0,0,0,0,0, + 3,3,0,3,3,3,3,3,2,3,3,0,0,2,3,3,0,3,3,0,2,3,3,0,2,0,0,0,0,0,0, + 0,3,3,0,3,0,3,3,3,3,0,3,3,3,0,0,3,0,0,2,3,3,0,2,0,0,0,0,2,0,0, + 
3,3,2,0,3,3,3,2,0,2,3,0,2,0,3,2,0,3,3,0,0,0,3,2,2,0,0,0,0,0,0, + 3,0,3,0,2,3,3,2,3,3,3,2,0,3,0,3,2,0,0,2,0,0,0,0,2,0,3,0,0,0,0, + 3,3,3,3,3,3,3,0,0,3,3,0,0,2,2,3,2,0,2,0,0,2,0,2,3,2,2,0,0,0,0, + 3,3,3,3,3,3,3,2,0,2,3,2,0,2,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0, + 3,3,2,0,2,3,0,0,0,0,3,0,0,0,0,3,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0, + 3,3,2,3,0,3,2,0,0,0,3,2,2,2,0,2,2,0,0,0,0,0,0,0,2,0,0,0,2,0,2, + 3,3,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,3,0,3,0,2,2,0,0,0,0,0,0, + 2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_1IrishModel = +{ + Iso_8859_1_CharToOrderMap, + IrishLangModel, + 31, + (float)0.9974076651249096, + PR_TRUE, + "ISO-8859-1"
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangItalianModel.cpp
Added
@@ -0,0 +1,264 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Italian *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-21 18:46:08.841217 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_3_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 4X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 6X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 59,SYM,SYM,SYM,ILL, 60,SYM,SYM, 61, 48, 47, 62,SYM,ILL, 58, /* AX */ + SYM, 63,SYM,SYM,SYM,SYM, 64,SYM,SYM, 46, 48, 47, 65,SYM,ILL, 58, /* BX */ + 22, 32, 50,ILL, 39, 66, 67, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* CX */ + ILL, 44, 29, 33, 51, 68, 
34,SYM, 69, 28, 45, 70, 36, 71, 72, 73, /* DX */ + 22, 32, 50,ILL, 39, 74, 75, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* EX */ + ILL, 44, 29, 33, 51, 76, 34,SYM, 77, 28, 45, 78, 36, 79, 80,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_15_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 4X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 6X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 35,SYM, 35,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 41, 81,SYM,SYM, 41,SYM,SYM,SYM, 52, 52, 82,SYM, /* BX */ + 22, 32, 50, 43, 39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* CX */ + 56, 44, 29, 33, 51, 83, 34,SYM, 57, 28, 45, 84, 36, 85, 86, 87, /* DX */ + 22, 32, 50, 43, 39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* EX */ + 56, 44, 29, 33, 51, 88, 34,SYM, 57, 28, 45, 89, 36, 90, 91, 92, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_9_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 4X 
*/ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 6X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 93,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 22, 32, 50, 43, 39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* CX */ + 47, 44, 29, 33, 51, 94, 34,SYM, 57, 28, 45, 95, 36, 96, 48, 97, /* DX */ + 22, 32, 50, 43, 39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* EX */ + 47, 44, 29, 33, 51, 98, 34,SYM, 57, 28, 45, 99, 36, 46, 48,100, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_1_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 4X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 6X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,101,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 22, 32, 50, 43, 39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* CX */ + 56, 44, 29, 33, 51,102, 34,SYM, 57, 28, 45,103, 36,104,105,106, /* DX */ + 22, 32, 50, 43, 
39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* EX */ + 56, 44, 29, 33, 51,107, 34,SYM, 57, 28, 45,108, 36,109,110,111, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 4X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 16, 9, 10, 2, 17, 14, 19, 0, 27, 21, 5, 12, 4, 3, /* 6X */ + 13, 20, 6, 8, 7, 11, 15, 25, 26, 23, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,112,SYM,SYM,SYM,SYM,SYM,SYM, 35,SYM, 52,ILL, 41,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 35,SYM, 52,ILL, 41,113, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,114,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 22, 32, 50, 43, 39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* CX */ + 56, 44, 29, 33, 51,115, 34,SYM, 57, 28, 45,116, 36,117,118,119, /* DX */ + 22, 32, 50, 43, 39, 53, 54, 38, 24, 30, 55, 40, 31, 37, 42, 49, /* EX */ + 56, 44, 29, 33, 51,120, 34,SYM, 57, 28, 45,121, 36,122,123,124, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 872 + * First 512 sequences: 0.9989484485502651 + * Next 512 sequences (512-1024): 0.0010515514497349433 + * Rest: -4.336808689942018e-17 + * Negative sequences: TODO + */ +static const PRUint8 ItalianLangModel = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,3,3,3,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,3,3,3,0,2,0,0,0,2, + 
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,2,3,2,3,0,3,3,2,2,0, + 3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,2,3,2,0,3,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,3,0,2,3,3,2,3,2,2,3,3,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,0,0,3,2,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,3,3,0,3,0,0,3,2,0,3,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,0,2,3,3,2,3,2,3,2,2,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,0,3,0,3,2,3,3,3,0,3,2,3,0,0, + 3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,0,0,2,0,0,0,3,0,2,3,0,0,3,2,2,2,2, + 3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,2,2,2,2,3,0,3,2,3,0,2,0,2,0,3,2,0,2,2, + 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,3,2,0,3,2,2,0,3,0,2,2,2,0,2,2,0,0,2, + 3,3,3,3,2,3,3,0,2,2,2,3,2,2,2,3,2,0,0,2,0,2,2,3,2,0,0,0,0,2,2,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,0,2,3,0,2,0,3,0,3,0,2,2,2,2,3,2,0, + 3,3,3,3,0,3,3,3,2,3,0,3,2,2,3,2,2,3,0,2,0,2,0,0,2,2,2,2,2,0,2,0,0,0, + 3,3,3,3,3,2,2,2,2,0,2,3,0,2,3,0,3,2,3,3,0,3,0,3,0,2,0,2,0,3,2,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,0,2,0,2,0,3,0,3,0,3,0,2,0,0,3,0,3,0, + 2,3,0,2,0,0,2,0,2,0,0,3,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangLatvianModel.cpp
Added
@@ -0,0 +1,207 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Latvian *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-21 00:19:18.362275 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_4_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 4X */ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 6X */ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 55, 56, 57,SYM, 58, 26,SYM,SYM, 23, 21, 31, 59,SYM, 29,SYM, /* AX */ + SYM, 60,SYM, 61,SYM, 62, 26,SYM,SYM, 23, 21, 31, 63, 48, 29, 48, /* BX */ + 8, 42, 64, 65, 40, 52, 53, 66, 32, 37, 67, 43, 46, 45, 49, 18, /* CX */ + 68, 24, 51, 30, 69, 70, 
36,SYM, 71, 72, 73, 74, 39, 75, 27, 44, /* DX */ + 8, 42, 76, 77, 40, 52, 53, 78, 32, 37, 79, 43, 46, 45, 49, 18, /* EX */ + 80, 24, 51, 30, 81, 82, 36,SYM, 83, 84, 85, 86, 39, 87, 27,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_10_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 4X */ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 6X */ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 88, 21, 31, 18, 89, 30,SYM, 26, 90, 23, 91, 29,SYM, 27, 48, /* AX */ + SYM, 92, 21, 31, 18, 93, 30,SYM, 26, 94, 23, 95, 29, 96, 27, 48, /* BX */ + 8, 42, 97, 98, 40, 52, 53, 99, 32, 37,100, 43, 46, 45, 49,101, /* CX */ + 50, 24, 51, 47,102,103, 36,104,105,106,107,108, 39,109, 54, 44, /* DX */ + 8, 42,110,111, 40, 52, 53,112, 32, 37,113, 43, 46, 45, 49,114, /* EX */ + 50, 24, 51, 47,115,116, 36,117,118,119,120,121, 39,122, 54,123, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_13_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 4X 
*/ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 22, 13, 3, 25, 19, 28, 1, 16, 11, 9, 12, 7, 10, /* 6X */ + 15, 38, 4, 2, 5, 6, 14, 33, 35, 34, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,124,SYM,125,SYM,SYM,SYM,SYM, 53, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,126,SYM,127,SYM,SYM,SYM,SYM, 53, /* BX */ + 128,129, 8,130, 40, 52,131, 21, 32, 37,132, 46, 31, 30, 18, 26, /* CX */ + 23,133, 24, 47, 51,134, 36,SYM,135, 41,136, 27, 39,137, 29, 44, /* DX */ + 138,139, 8,140, 40, 52,141, 21, 32, 37,142, 46, 31, 30, 18, 26, /* EX */ + 23,143, 24, 47, 51,144, 36,SYM,145, 41,146, 27, 39,147, 29,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 970 + * First 512 sequences: 0.9904102202220861 + * Next 512 sequences (512-1024): 0.009589779777913882 + * Rest: -1.734723475976807e-17 + * Negative sequences: TODO + */ +static const PRUint8 LatvianLangModel = +{ + 2,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,2,3,3,2,3,3,3,3,3,0,3,3,2,2,3,2,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,2,3,3,3,3,3,2,3,3,3,2,3,0,0,2,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,3,3,0,2,2,2,3,2,2,0,0,0,2,2,0,2,2,2, + 3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,3,3,3,3,0,3,3,2,3,2,2,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,2,3,2,2,2,2,0,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,3,3,0,0,2,0,2,2,0,0,0,0, + 3,3,3,2,3,3,2,3,3,3,2,3,3,3,3,3,3,3,2,3,3,2,3,3,3,2,3,0,2,2,2,2,2,0,2,0,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,3,3,3,2,3,3,3,0,3,0,2,2,2,0,0,3,0,2,0,0,0,2, + 2,2,3,2,3,3,2,3,0,3,0,3,3,3,3,3,3,3,0,2,3,0,3,3,3,3,3,0,0,2,0,2,2,0,0,0,0,0,0, + 3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,3,2,0,2,2,0,2,2,0,2,0, + 
3,2,3,2,3,3,3,3,2,3,2,3,3,3,3,3,3,3,0,3,3,2,3,3,3,3,3,0,2,3,2,3,2,2,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,2,2,2,2,2,2,3,3,2,0,3,2,2,0,0,0,0,0,2,0,2,0,0, + 3,3,3,3,0,3,3,3,3,2,3,3,2,2,2,3,3,3,3,2,0,3,2,2,0,2,0,3,0,0,0,2,0,0,2,2,0,2,0, + 3,3,3,3,3,2,3,3,3,2,3,2,3,2,3,2,2,2,3,2,3,3,2,2,2,0,0,2,0,3,0,0,0,2,2,0,0,2,0, + 3,3,3,3,2,2,3,2,3,2,3,2,2,2,2,3,3,2,3,2,2,3,2,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,3,3,2,3,2,3,2,2,2,2,2,0,0,2,0,0,0,0,0,0,0, + 3,3,3,3,2,0,3,3,3,2,3,2,2,2,2,2,0,0,2,2,0,3,2,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,0,3,2,3,3,3,2,2,2,2,2,2,2,3,0,0,3,2,2,0,0,2,3,2,0,0,0,2,0,2,0,2,0,0, + 0,0,3,0,3,3,0,3,0,3,0,3,3,3,3,3,3,3,0,3,3,0,3,3,3,2,2,0,0,2,2,0,2,0,0,0,0,0,0, + 3,3,3,3,3,3,3,2,3,3,3,2,2,2,2,0,0,0,0,2,2,2,0,3,0,2,3,3,2,2,0,0,0,0,2,0,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,0,0,2,2,0,0,0,0,2,2,0,0,0,0, + 2,0,3,0,3,3,0,3,0,3,0,3,3,3,3,2,3,2,0,3,3,0,3,3,2,2,3,0,0,2,2,3,0,0,0,0,0,0,0, + 3,3,3,3,2,2,3,2,3,2,3,3,2,2,2,2,0,2,3,0,2,3,2,2,0,0,0,2,3,0,0,2,0,0,2,0,0,0,0, + 3,3,3,3,2,3,3,3,3,3,3,2,2,2,3,2,2,2,3,2,2,2,0,0,2,0,2,2,0,0,3,0,0,0,0,0,0,0,0, + 3,3,2,3,0,0,3,2,3,0,3,0,2,2,2,2,2,2,0,2,0,3,2,3,0,0,0,2,0,0,3,2,0,0,0,0,0,0,0, + 3,3,3,3,3,2,3,2,2,3,3,2,2,0,0,0,0,0,2,2,0,2,0,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0, + 3,3,2,2,2,0,3,2,3,2,3,2,2,0,2,2,2,0,2,2,0,2,0,2,2,0,2,2,0,0,2,3,0,0,0,0,0,0,0, + 0,2,3,0,3,3,0,3,0,3,2,3,2,3,3,3,2,0,0,2,3,0,3,2,0,2,0,0,2,2,0,0,0,0,0,0,0,0,0, + 3,3,2,3,2,2,2,3,2,2,3,2,2,2,0,0,2,0,2,0,0,2,0,0,0,0,0,2,2,0,0,0,0,2,2,0,2,0,0, + 3,3,2,3,2,0,3,2,3,2,3,2,2,0,2,0,0,0,2,0,2,2,0,0,2,0,0,2,0,0,2,2,0,0,0,0,0,0,0, + 3,3,2,3,0,2,3,0,2,0,2,0,0,0,0,0,0,0,3,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,3,0,3,0,0,2,0,0,0,2,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,2,2,3,0,0,3,2,2,0,2,2,2,0,0,2,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0, + 2,2,2,2,0,2,0,0,0,2,2,0,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,3,0,0,0,0,0, + 
2,0,2,2,2,0,0,2,0,2,2,0,2,2,0,0,0,0,0,2,0,0,2,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0, + 2,2,0,0,0,0,2,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0, + 0,0,2,0,0,2,0,2,0,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0, + 0,0,2,0,2,2,0,2,0,0,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_4LatvianModel = +{ + Iso_8859_4_CharToOrderMap, + LatvianLangModel, + 39, + (float)0.9904102202220861, + PR_TRUE, + "ISO-8859-4" +}; + +const SequenceModel Iso_8859_10LatvianModel = +{ + Iso_8859_10_CharToOrderMap, + LatvianLangModel, + 39, + (float)0.9904102202220861, + PR_TRUE, + "ISO-8859-10" +}; + +const SequenceModel Iso_8859_13LatvianModel =
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangLithuanianModel.cpp
Added
@@ -0,0 +1,206 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Lithuanian *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-21 00:25:34.775158 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_10_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 4X */ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 6X */ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 29, 50, 60, 47, 61, 62,SYM, 56, 55, 21, 63, 22,SYM, 28, 64, /* AX */ + SYM, 29, 50, 65, 47, 66, 67,SYM, 56, 55, 21, 68, 22, 69, 28, 70, /* BX */ + 41, 39, 71, 53, 38, 43, 72, 30, 24, 36, 31, 73, 17, 40, 74, 46, /* CX */ + 75, 57, 34, 44, 59, 76, 
35, 77, 48, 20, 54, 78, 45, 79, 80, 52, /* DX */ + 41, 39, 81, 53, 38, 43, 82, 30, 24, 36, 31, 83, 17, 40, 84, 46, /* EX */ + 85, 57, 34, 44, 59, 86, 35, 87, 48, 20, 54, 88, 45, 89, 90, 91, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_4_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 4X */ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 6X */ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 29, 92, 93,SYM, 94, 56,SYM,SYM, 21, 50, 95, 96,SYM, 22,SYM, /* AX */ + SYM, 29,SYM, 97,SYM, 98, 56,SYM,SYM, 21, 50, 99,100,101, 22,102, /* BX */ + 41, 39,103, 53, 38, 43,104, 30, 24, 36, 31,105, 17, 40,106, 47, /* CX */ + 55, 57, 34,107, 59,108, 35,SYM, 48, 20, 54,109, 45,110, 28, 52, /* DX */ + 41, 39,111, 53, 38, 43,112, 30, 24, 36, 31,113, 17, 40,114, 47, /* EX */ + 55, 57, 34,115, 59,116, 35,SYM, 48, 20, 54,117, 45,118, 28,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_13_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 4X 
*/ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 18, 23, 12, 4, 25, 16, 26, 0, 14, 9, 10, 11, 6, 3, /* 6X */ + 15, 37, 5, 2, 7, 8, 13, 33, 32, 19, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM,119,SYM,SYM,SYM,SYM,120, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM,121,SYM,SYM,SYM,SYM,122, /* BX */ + 29, 30, 41, 49, 38, 43, 31, 50, 24, 36,123, 17,124,125, 47, 56, /* CX */ + 21, 51, 57, 44, 34,126, 35,SYM, 20, 42, 58, 28, 45,127, 22, 52, /* DX */ + 29, 30, 41, 49, 38, 43, 31, 50, 24, 36,128, 17,129,130, 47, 56, /* EX */ + 21, 51, 57, 44, 34,131, 35,SYM, 20, 42, 58, 28, 45,132, 22,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 1016 + * First 512 sequences: 0.9928710196247589 + * Next 512 sequences (512-1024): 0.0071289803752411715 + * Rest: -4.85722573273506e-17 + * Negative sequences: TODO + */ +static const PRUint8 LithuanianLangModel = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,3,3,3,0,2,3,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,0,3,3,3,3,3,3,3,0,0,0,0,2,2,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,0,3,3,2,3,2,3,3,2,3,0,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,0,3,3,3,2,3,3,3,0,0,0,0,2,3,0,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,0,3,3,3,3,3,2,3,0,0,2,0,2,3,0,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,3,3,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,2,2,3,3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,2,3,3,3,2,2,2,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,3,3,3,3,2,0,2,0,2,3,2,3,3,3,3,0,2,2,2,2,0, + 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,0,3,2,0,3,3,3,3,3,2,3,0,0,0,0,0,2,0,0,0,0, + 3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,2,2,3,2,3,3,3,0,3,2,2,3,2,3,3,2,3,0,2,2,0,2,0, + 
3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,3,3,2,3,3,3,3,0,2,0,2,2,0, + 3,3,3,3,3,2,2,3,3,2,2,3,2,2,2,3,2,3,3,3,3,2,3,2,0,2,0,2,3,3,0,3,0,2,2,2,2,0, + 3,3,3,3,3,3,2,2,3,3,2,3,2,3,2,2,2,3,2,3,3,2,3,2,0,2,2,2,2,3,2,3,0,2,2,2,2,2, + 3,3,3,3,3,2,2,2,3,2,3,0,2,0,2,2,0,3,0,3,3,2,0,2,0,0,0,3,2,3,0,3,0,0,0,0,0,0, + 3,3,2,3,3,2,2,2,3,2,0,0,0,0,0,2,2,3,0,2,3,0,0,0,0,0,0,0,3,3,3,3,0,0,2,2,0,0, + 3,3,3,3,3,3,2,3,3,3,3,2,2,3,3,2,2,3,0,3,2,3,2,2,2,2,3,0,2,2,2,2,0,0,2,0,2,0, + 3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,2,3,2,3,3,2,2,2,0,0,3,3,3,3,2,2,0,2,2,2,0,0, + 2,0,3,0,0,3,3,3,2,3,3,3,3,3,3,0,3,0,2,0,0,2,2,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,2,3,3,2,3,0,3,2,2,2,0,3,2,2,3,2,2,2,0,0,2,2,3,3,2,3,0,2,2,2,0,0, + 2,3,3,2,2,3,3,3,2,3,3,3,3,3,3,3,3,0,3,2,0,2,2,2,3,2,0,3,2,0,0,0,0,0,2,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,2,3,0,3,2,3,2,3,2,2,2,2,3,2,0,0,2,2,2,2,0,0,2,0,0,0, + 3,3,3,3,3,2,3,3,3,2,2,3,3,3,2,2,2,3,2,3,2,2,0,0,0,2,0,0,2,2,2,2,0,0,2,0,0,0, + 3,3,2,3,3,2,0,2,3,3,3,2,2,2,0,0,2,2,2,2,0,0,0,2,0,2,3,2,3,2,0,0,0,0,0,0,2,2, + 3,3,0,2,3,0,0,0,2,2,0,0,2,0,0,2,0,2,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0, + 3,3,2,3,3,3,0,2,3,2,3,2,0,0,2,0,2,2,2,2,2,0,0,2,0,2,0,0,2,2,0,0,0,0,0,2,0,0, + 3,3,2,3,3,3,3,3,3,2,2,3,2,0,2,0,0,0,2,2,2,0,0,0,0,2,0,0,2,2,0,0,0,2,2,0,0,0, + 3,3,2,3,3,2,2,2,3,2,3,3,3,2,0,2,2,2,2,3,3,0,0,2,0,0,2,2,2,2,0,2,0,2,2,0,2,0, + 2,0,3,0,0,3,3,3,0,3,2,3,3,2,0,2,3,0,2,0,0,2,2,0,3,0,0,3,0,0,0,0,0,0,0,0,0,0, + 0,0,3,0,0,2,0,0,0,2,2,2,0,2,3,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,2,3,0,0,3,0,3,0,3,3,2,2,3,2,3,3,2,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,2,0,0,2,2,0,2,2,0,0,0,0,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0, + 3,3,2,2,3,2,2,0,2,0,2,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0, + 2,0,2,0,2,0,2,0,0,2,0,2,2,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 
0,0,2,0,0,2,2,2,0,2,2,2,2,0,0,0,2,0,0,0,0,0,0,2,0,2,0,2,0,0,0,0,0,0,0,0,0,0, + 0,0,2,0,0,2,2,0,0,0,0,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_10LithuanianModel = +{ + Iso_8859_10_CharToOrderMap, + LithuanianLangModel, + 38, + (float)0.9928710196247589, + PR_TRUE, + "ISO-8859-10" +}; + +const SequenceModel Iso_8859_4LithuanianModel = +{ + Iso_8859_4_CharToOrderMap, + LithuanianLangModel, + 38, + (float)0.9928710196247589, + PR_TRUE, + "ISO-8859-4" +}; + +const SequenceModel Iso_8859_13LithuanianModel = +{
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangMalteseModel.cpp
Added
@@ -0,0 +1,137 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Maltese *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-21 02:07:45.509404 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_3_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 15, 28, 13, 4, 16, 19, 22, 1, 9, 12, 3, 10, 5, 8, /* 4X */ + 14, 27, 6, 11, 2, 7, 26, 18, 25, 30, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 15, 28, 13, 4, 16, 19, 22, 1, 9, 12, 3, 10, 5, 8, /* 6X */ + 14, 27, 6, 11, 2, 7, 26, 18, 25, 30, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 17,SYM,SYM,SYM,ILL, 48,SYM,SYM, 49, 50, 51, 52,SYM,ILL, 21, /* AX */ + SYM, 17,SYM,SYM,SYM,SYM, 53,SYM,SYM, 54, 55, 56, 57,SYM,ILL, 21, /* BX */ + 29, 36, 47,ILL, 58, 24, 59, 40, 33, 31, 60, 39, 45, 35, 61, 62, /* CX */ + ILL, 37, 32, 34, 44, 23, 
38,SYM, 63, 43, 42, 64, 46, 65, 66, 41, /* DX */ + 29, 36, 47,ILL, 67, 24, 68, 40, 33, 31, 69, 39, 45, 35, 70, 71, /* EX */ + ILL, 37, 32, 34, 44, 23, 38,SYM, 72, 43, 42, 73, 46, 74, 75,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 870 + * First 512 sequences: 0.9959115850692665 + * Next 512 sequences (512-1024): 0.004088414930733575 + * Rest: -4.423544863740858e-17 + * Negative sequences: TODO + */ +static const PRUint8 MalteseLangModel = +{ + 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,3,3,3,3,3,2,0,3,0,0,3,3,3,2,3,3, + 3,3,3,3,3,2,2,3,3,3,3,3,3,3,2,3,3,2,3,3,2,0,3,3,0,3,3,3,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3, + 3,3,3,3,3,3,2,3,3,3,2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,2,3,3,3,3,3,2,3,0,3, + 3,3,3,3,3,3,3,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2, + 3,3,2,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,0,3,3,2,2,2,2,2,0,0,0, + 3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,2,3,2,2,3,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,0,0,3,2,0,0,3,3,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,3,3,2,3,0,0,0,2,0,3,2,0,0,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,3,3,2,2,0,3,0,0,2,2,0,2,2,2, + 3,3,2,3,3,2,3,3,3,3,2,3,2,2,3,0,0,0,2,3,0,0,3,0,2,0,2,0,2,0,0, + 3,3,3,3,3,3,3,3,3,3,0,3,3,3,0,3,2,3,3,3,0,3,2,0,0,2,0,3,3,0,2, + 3,3,3,3,3,3,3,3,3,3,0,3,2,2,0,2,3,0,0,2,0,0,2,0,0,0,0,2,2,0,2, + 3,3,3,3,3,2,3,3,3,3,3,3,2,3,0,3,2,3,2,0,0,2,3,2,0,2,0,3,0,0,0, + 3,3,3,3,3,3,3,2,3,2,2,3,3,3,2,2,2,2,3,2,0,2,2,3,2,3,2,2,0,0,2, + 3,3,2,3,3,3,3,3,3,2,2,2,2,3,2,2,0,3,3,3,2,3,3,0,0,0,3,0,2,2,3, + 3,3,2,2,3,2,2,3,2,3,2,0,0,0,2,0,0,0,2,2,3,0,0,0,0,0,2,2,0,0,0, + 3,3,2,3,3,2,0,3,3,3,3,0,0,3,0,2,2,0,2,3,0,3,0,0,0,0,3,0,0,0,0, + 
3,3,3,2,3,2,3,3,3,0,3,2,2,2,2,2,0,0,2,0,2,0,2,0,0,0,0,2,0,0,2, + 3,3,2,2,3,3,3,3,3,3,2,0,0,3,0,2,0,2,2,3,2,2,0,3,0,0,2,0,0,2,0, + 3,3,2,2,3,0,2,2,0,3,0,0,2,0,2,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0, + 3,3,3,3,3,2,3,3,3,3,3,0,2,2,0,3,2,0,2,0,0,0,3,0,0,3,2,0,2,0,0, + 3,3,0,2,3,2,3,3,3,3,0,2,0,3,2,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,2, + 3,3,3,2,3,0,3,3,3,3,2,3,2,3,0,3,3,0,3,3,0,0,2,2,2,2,0,3,0,2,0, + 3,3,3,3,3,0,2,2,3,2,0,3,3,3,0,2,3,0,0,0,2,0,3,0,0,0,0,2,2,0,2, + 0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,2,0,0,0,0,0,0,0,2,0,2,0,2, +}; + + +const SequenceModel Iso_8859_3MalteseModel = +{ + Iso_8859_3_CharToOrderMap, + MalteseLangModel, + 31, + (float)0.9959115850692665, + PR_TRUE, + "ISO-8859-3" +};
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangNorwegianModel.cpp
Added
@@ -0,0 +1,323 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Norwegian *********/ + +/** + * Generated by BuildLangModel.py + * On: 2022-01-28 21:58:11.143599 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Ibm865_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 22, 10, 7, 12, 2, 8, /* 4X */ + 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 22, 10, 7, 12, 2, 8, /* 6X */ + 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 43, 32, 28, 50, 31, 45, 19, 43, 53, 42, 41, 57, 61, 58, 31, 19, /* 8X */ + 28, 24, 24, 37, 30, 54, 63, 59, 64, 30, 32, 21,SYM, 21,SYM,SYM, /* 9X */ + 36, 33, 35, 40, 44, 44,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + 
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* DX */ + 48, 46, 65, 66, 60, 60, 67, 62, 68, 69, 70, 71, 72, 73, 52,SYM, /* EX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_15_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 22, 10, 7, 12, 2, 8, /* 4X */ + 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 22, 10, 7, 12, 2, 8, /* 6X */ + 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 47,SYM, 47,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 49, 74,SYM,SYM, 49,SYM,SYM,SYM, 51, 51, 75,SYM, /* BX */ + 45, 36, 50, 55, 31, 19, 24, 43, 41, 28, 53, 42, 58, 33, 61, 57, /* CX */ + 34, 44, 54, 35, 37, 56, 30,SYM, 21, 59, 40, 76, 32, 39, 38, 46, /* DX */ + 45, 36, 50, 55, 31, 19, 24, 43, 41, 28, 53, 42, 58, 33, 61, 57, /* EX */ + 34, 44, 54, 35, 37, 56, 30,SYM, 21, 59, 40, 77, 32, 39, 38, 78, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_1_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 
22, 10, 7, 12, 2, 8, /* 4X */ + 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 22, 10, 7, 12, 2, 8, /* 6X */ + 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 79,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 45, 36, 50, 55, 31, 19, 24, 43, 41, 28, 53, 42, 58, 33, 61, 57, /* CX */ + 34, 44, 54, 35, 37, 56, 30,SYM, 21, 59, 40, 80, 32, 39, 38, 46, /* DX */ + 45, 36, 50, 55, 31, 19, 24, 43, 41, 28, 53, 42, 58, 33, 61, 57, /* EX */ + 34, 44, 54, 35, 37, 56, 30,SYM, 21, 59, 40, 81, 32, 39, 38, 82, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 22, 10, 7, 12, 2, 8, /* 4X */ + 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 5, 17, 23, 9, 0, 14, 11, 18, 6, 22, 10, 7, 12, 2, 8, /* 6X */ + 15, 29, 1, 4, 3, 16, 13, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 83,SYM,SYM,SYM,SYM,SYM,SYM, 47,SYM, 51,ILL, 49,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 47,SYM, 51,ILL, 49, 84, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 85,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 45, 36, 50, 55, 31, 19, 24, 43, 41, 28, 53, 42, 58, 33, 61, 57, /* CX */ + 34, 44, 54, 35, 37, 56, 30,SYM, 21, 59, 40, 86, 32, 39, 38, 46, 
/* DX */ + 45, 36, 50, 55, 31, 19, 24, 43, 41, 28, 53, 42, 58, 33, 61, 57, /* EX */ + 34, 44, 54, 35, 37, 56, 30,SYM, 21, 59, 40, 87, 32, 39, 38, 88, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 991 + * First 512 sequences: 0.9975864274305254 + * Next 512 sequences (512-1024): 0.002413572569474574 + * Rest: 3.5128150388530344e-17 + * Negative sequences: TODO + */ +static const PRUint8 NorwegianLangModel = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,0,2,0, + 0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2, + 2,2,2,2,2,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,2, + 2,2,2,0,0,2,0,0,2,2,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,3,2,0, + 2,2,2,0,2,0,0,0,2,0,0,2,0,0,2,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2,2, + 2,2,0,0,0,2,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,0,3,3,3,0,2,0, + 0,0,2,2,0,0,0,0,0,2,0,2,0,2,0,2,2,0,2,0,0,0,0,0,0,0,2,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,2,2,3,2,2,2,0, + 0,0,0,2,2,0,0,0,0,0,2,2,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,2,2,2, + 2,2,2,0,2,2,0,2,0,0,2,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,2,3,0,3,2,3,0,2,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,0,2,0,2, + 0,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,0,0,0,0,2, + 0,0,0,0,2,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,3,2,2,2,0,0,2,2,2, + 
2,2,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,0,2,0,2,2,2, + 2,2,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,0,3,2,0,0,2,0,0, + 2,0,2,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,2,2,3,2,2,3,0,3,2,2,3,3,3,3,3,3,0,0,0,2,0,2, + 0,2,0,0,2,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,0,2,0,0, + 2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,0,2,2,2,3,2,2,3,2,2,2,0, + 0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,3,3,3,3,3,2,2,2,2,0,2,2,3,3,2,3,3,3,3,2,3,2,2,0,2,0,2, + 2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,2,2,2,3,3,2,2,3,2,2,3,3,3,3,2,3,2,2,0,2,0,2, + 2,2,2,0,2,2,0,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,2,2,3,2,3,3,3,2,3,3,3,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,0,2,0,2,3,2,2,2,2,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangPolishModel.cpp
Added
@@ -0,0 +1,298 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Polish *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-21 17:21:04.405363 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Ibm852_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 4X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 6X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 47, 39, 34, 54, 40, 78, 30, 47, 19, 58, 49, 49, 77, 32, 40, 30, /* 8X */ + 34, 79, 80, 55, 38, 74, 74, 28, 28, 38, 39, 76, 76, 19,SYM, 44, /* 9X */ + 35, 37, 24, 51, 25, 25, 45, 45, 23, 23,SYM, 32, 44, 56,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 35, 54, 46, 56,SYM,SYM,SYM,SYM, 27, 27,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM, 53, 53,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + 70, 70, 69, 58, 69, 81, 37, 77, 
46,SYM,SYM,SYM,SYM, 65, 82,SYM, /* DX */ + 24, 57, 55, 29, 29, 83, 41, 41, 84, 51, 85, 86, 60, 60, 65,SYM, /* EX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 87, 50, 50,SYM,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_16_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 4X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 6X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 25, 25, 19,SYM,SYM, 41,SYM, 41,SYM, 62,SYM, 32,SYM, 32, 27, /* AX */ + SYM,SYM, 44, 19, 45,SYM,SYM,SYM, 45, 44, 62,SYM, 75, 75, 88, 27, /* BX */ + 61, 35, 54, 53, 40, 30, 89, 47, 43, 34, 64, 58, 90, 37, 77, 91, /* CX */ + 70, 29, 66, 24, 55, 49, 38, 28, 92, 68, 51, 93, 39, 23, 72, 57, /* DX */ + 61, 35, 54, 53, 40, 30, 94, 47, 43, 34, 64, 58, 95, 37, 77, 96, /* EX */ + 70, 29, 66, 24, 55, 49, 38, 28, 97, 68, 51, 98, 39, 23, 72, 99, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_2_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 4X */ + 
13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 6X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 25,SYM, 19,SYM, 74, 28,SYM,SYM, 41, 56, 76, 32,SYM, 45, 27, /* AX */ + SYM, 25,SYM, 19,SYM, 74, 28,SYM,SYM, 41, 56, 76, 32,SYM, 45, 27, /* BX */ + 100, 35, 54, 53, 40,101, 30, 47, 44, 34, 23, 58, 46, 37, 77, 69, /* CX */ + 70, 29,102, 24, 55, 49, 38,SYM, 50,103, 51,104, 39, 60, 65, 57, /* DX */ + 105, 35, 54, 53, 40,106, 30, 47, 44, 34, 23, 58, 46, 37, 77, 69, /* EX */ + 70, 29,107, 24, 55, 49, 38,SYM, 50,108, 51,109, 39, 60, 65,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Mac_Centraleurope_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 4X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 6X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 40, 63, 63, 34, 25, 38, 39, 35, 25, 44, 40, 44, 30, 30, 34, 32, /* 8X */ + 32, 69, 37, 69,110,111, 71, 24, 71, 55, 38, 67, 51, 46, 46, 39, /* 9X */ + SYM,SYM, 23,SYM,SYM,SYM,SYM, 57,SYM,SYM,SYM, 23,SYM,SYM,112,113, /* AX */ + 114, 73,SYM,SYM, 73,115,SYM,SYM, 19,116,117, 74, 74,118,119,120, /* BX */ + 121, 29,SYM,SYM, 29,122,SYM,SYM,SYM,SYM,SYM,123, 49, 67, 49, 42, /* CX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 42,124,125, 50,SYM,SYM, 50,126, /* DX */ + 127, 
41,SYM,SYM, 41, 28, 28, 35, 76, 76, 37, 45, 45, 59, 24, 55, /* EX */ + 59,128, 51,129,130,131,132,133, 60, 60,134, 27, 19, 27,135,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_13_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 4X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 6X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM,136,SYM,SYM,SYM,SYM,137, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM,138,SYM,SYM,SYM,SYM,139, /* BX */ + 25,140, 63, 30, 40, 52, 23,141, 44, 34, 32, 71,142,143, 73,144, /* CX */ + 41, 29,145, 24, 42, 67, 38,SYM,146, 19, 28, 59, 39, 27, 45, 57, /* DX */ + 25,147, 63, 30, 40, 52, 23,148, 44, 34, 32, 71,149,150, 73,151, /* EX */ + 41, 29,152, 24, 42, 67, 38,SYM,153, 19, 28, 59, 39, 27, 45,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1250_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 4X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 
8,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 6X */ + 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM, 41,SYM, 28, 76, 45, 32, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 41,SYM, 28, 76, 45, 32, /* 9X */ + SYM,SYM,SYM, 19,SYM, 25,SYM,SYM,SYM,SYM, 56,SYM,SYM,SYM,SYM, 27, /* AX */ + SYM,SYM,SYM, 19,SYM,SYM,SYM,SYM,SYM, 25, 56,SYM, 74,SYM, 74, 27, /* BX */ + 154, 35, 54, 53, 40,155, 30, 47, 44, 34, 23, 58, 46, 37, 77, 69, /* CX */ + 70, 29,156, 24, 55, 49, 38,SYM, 50,157, 51,158, 39, 60, 65, 57, /* DX */ + 159, 35, 54, 53, 40,160, 30, 47, 44, 34, 23, 58, 46, 37, 77, 69, /* EX */ + 70, 29,161, 24, 55, 49, 38,SYM, 50,162, 51,163, 39, 60, 65,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 1321 + * First 512 sequences: 0.9894531815946438 + * Next 512 sequences (512-1024): 0.010193795364991133 + * Rest: 0.0003530230403650733 + * Negative sequences: TODO + */ +static const PRUint8 PolishLangModel = +{
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangPortugueseModel.cpp
Added
@@ -0,0 +1,237 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Portuguese *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-20 23:47:27.348423 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_1_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 4X */ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 6X */ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 51,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 52, 23, 45, 47, /* CX */ + 48, 53, 46, 27, 37, 30, 
38,SYM, 54, 55, 33, 56, 40, 57, 58, 49, /* DX */ + 36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 59, 23, 45, 47, /* EX */ + 48, 60, 46, 27, 37, 30, 38,SYM, 61, 62, 33, 63, 40, 64, 65, 50, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_9_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 4X */ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 6X */ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 66,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 67, 23, 45, 47, /* CX */ + 68, 69, 46, 27, 37, 30, 38,SYM, 70, 71, 33, 72, 40, 73, 74, 49, /* DX */ + 36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 75, 23, 45, 47, /* EX */ + 76, 77, 46, 27, 37, 30, 38,SYM, 78, 79, 33, 80, 40, 81, 82, 50, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_15_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 4X 
*/ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 6X */ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM, 83,SYM, 84,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 85, 86,SYM,SYM, 87,SYM,SYM,SYM, 88, 89, 50,SYM, /* BX */ + 36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 90, 23, 45, 47, /* CX */ + 48, 91, 46, 27, 37, 30, 38,SYM, 92, 93, 33, 94, 40, 95, 96, 49, /* DX */ + 36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44, 97, 23, 45, 47, /* EX */ + 48, 98, 46, 27, 37, 30, 38,SYM, 99,100, 33,101, 40,102,103, 50, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 4X */ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 17, 10, 7, 1, 16, 14, 18, 4, 28, 34, 12, 9, 6, 2, /* 6X */ + 13, 21, 5, 3, 8, 11, 15, 32, 24, 31, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,104,SYM,SYM,SYM,SYM,SYM,SYM,105,SYM,106,ILL,107,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,108,SYM,109,ILL,110, 50, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,111,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 36, 25, 35, 20, 41, 42, 43, 22, 39, 19, 29, 44,112, 23, 45, 47, /* CX */ + 48,113, 46, 27, 37, 30, 38,SYM,114,115, 33,116, 40,117,118, 49, /* DX */ + 36, 25, 35, 20, 
41, 42, 43, 22, 39, 19, 29, 44,119, 23, 45, 47, /* EX */ + 48,120, 46, 27, 37, 30, 38,SYM,121,122, 33,123, 40,124,125, 50, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 891 + * First 512 sequences: 0.9953179582313172 + * Next 512 sequences (512-1024): 0.0046820417686827855 + * Rest: 2.42861286636753e-17 + * Negative sequences: TODO + */ +static const PRUint8 PortugueseLangModel = +{ + 2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,3,3,3,3,0,3,2,3,0,0,3,2,2,3,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,3,3,2,3,2,3,2,3,0,2,3,3,2,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,2,3,2,3,2,3,0,2,3,3,0,3,0,0,0, + 3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,0,3,0,3,2,3,0,2,3,3,2,2,3,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,0,3,3,3,3,2,3,3,2,2,2,3,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,2,3,3,3,2,2,3,3,0,3, + 3,3,3,3,3,2,3,3,3,2,3,3,2,2,3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,2,0,3,2,3,3,2,0,3, + 3,3,3,3,3,3,2,3,2,3,2,3,3,2,3,2,2,2,2,3,3,2,0,3,0,3,0,3,2,3,2,3,3,3,0,2,0,2, + 3,3,3,3,3,3,3,0,3,3,3,3,3,2,2,2,2,2,3,3,3,0,0,3,0,3,2,3,0,3,2,3,2,2,2,3,0,3, + 3,3,3,3,3,2,3,2,2,3,2,3,2,3,2,0,2,3,0,3,3,2,0,3,0,3,2,3,0,2,2,3,2,3,0,3,0,3, + 3,3,3,2,3,3,3,2,3,3,3,3,3,2,2,0,2,2,3,3,2,2,3,3,0,3,2,3,0,3,2,3,0,2,3,3,0,2, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,2,2,3,3,3,2,3,0,3,3,0,2,2,0,2,0,0,0, + 3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,0,3,2,3,0,3,0,3,2,2,2,3,0,3, + 3,3,3,3,3,3,2,2,3,0,2,3,3,3,0,0,0,2,3,3,2,2,3,3,0,3,2,3,0,2,2,2,0,3,0,2,0,2, + 3,3,3,3,3,3,3,2,2,3,2,3,3,2,2,2,0,2,3,3,2,0,0,2,0,3,0,2,0,3,2,3,2,2,0,2,0,0, + 3,3,3,0,3,3,0,2,0,0,0,3,0,0,0,2,0,0,0,3,2,0,0,3,0,3,0,2,0,3,2,0,0,0,0,2,0,2, + 3,3,3,2,3,3,0,2,2,2,2,3,3,2,2,0,3,2,0,3,0,0,0,3,0,2,0,3,0,3,0,2,0,2,0,0,0,2, + 3,3,3,3,3,3,3,3,3,2,2,3,3,2,2,2,3,2,2,3,2,0,0,2,0,2,2,2,3,2,0,2,2,2,0,0,0,0, + 3,3,3,3,3,3,3,2,3,2,0,3,3,0,0,0,2,2,2,2,3,0,0,2,0,3,0,2,0,0,3,3,2,0,2,0,0,0, + 
2,2,2,3,2,3,3,3,3,3,3,2,3,3,2,2,2,0,0,0,0,0,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0, + 0,2,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,2,0,0,0,0,0,0,0,3,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0, + 3,0,3,3,0,3,3,3,3,3,3,0,3,3,3,3,3,3,0,0,0,2,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,3,0,0,0,3,0,3,3,2,3,0,3,2,0,2,2,2,0,0,2,3,2,0,2,2,0,2,0,0,0,0,0,0,2, + 0,0,0,3,0,3,2,2,3,0,3,2,3,3,3,3,3,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,0,3,2,2,0,0,2,2,3,0,0,0,0,0,2,2,2,2,0,0,0,0,2,2,2,0,2,2,0,2,0,0,2,0,0, + 0,0,0,3,2,3,3,3,3,3,3,0,3,3,3,2,3,2,0,0,0,2,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,2,0,2,0,0,0,2,3,0,2,0,0,0,0,0,0,2,0,0,0,0,3,0,0,0,0,0,0,0,2,2,0,0,0, + 0,0,0,3,0,0,3,0,2,3,0,2,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,2,3,2,2,3,2,3,2,3,2,2,0,2,2,2,0,0,0,0,0,3,0,2,0,2,0,0,0,2,0,2,0,0,0, + 3,3,3,2,3,2,2,2,3,2,2,2,2,0,0,2,0,2,3,0,0,0,0,0,0,0,2,0,0,0,0,2,2,0,2,0,0,0, + 0,0,0,3,0,2,3,3,2,3,2,0,3,2,0,2,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,3,2,2,0,0,3,2,2,2,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,2,0,0,0, + 0,0,0,0,0,0,3,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,2,2,0,0,3,2,2,3,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_1PortugueseModel =
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangRomanianModel.cpp
Added
@@ -0,0 +1,232 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Romanian *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-28 18:58:13.757152 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_16_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 2, 17, 9, 11, 0, 16, 15, 23, 1, 26, 27, 6, 12, 4, 8, /* 4X */ + 13, 32, 3, 10, 5, 7, 21, 29, 25, 28, 22,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 2, 17, 9, 11, 0, 16, 15, 23, 1, 26, 27, 6, 12, 4, 8, /* 6X */ + 13, 32, 3, 10, 5, 7, 21, 29, 25, 28, 22,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 60, 61, 46,SYM,SYM, 38,SYM, 38,SYM, 19,SYM, 62,SYM, 63, 64, /* AX */ + SYM,SYM, 41, 46, 40,SYM,SYM,SYM, 40, 41, 19,SYM, 65, 66, 67, 68, /* BX */ + 69, 30, 24, 14, 33, 35, 53, 42, 45, 31, 58, 49, 70, 37, 20, 48, /* CX */ + 43, 52, 59, 34, 71, 44, 36, 
56, 50, 72, 47, 73, 39, 74, 18, 57, /* DX */ + 75, 30, 24, 14, 33, 35, 53, 42, 45, 31, 58, 49, 76, 37, 20, 48, /* EX */ + 43, 52, 59, 34, 77, 44, 36, 56, 50, 78, 47, 79, 39, 80, 18, 81, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_2_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 2, 17, 9, 11, 0, 16, 15, 23, 1, 26, 27, 6, 12, 4, 8, /* 4X */ + 13, 32, 3, 10, 5, 7, 21, 29, 25, 28, 22,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 2, 17, 9, 11, 0, 16, 15, 23, 1, 26, 27, 6, 12, 4, 8, /* 6X */ + 13, 32, 3, 10, 5, 7, 21, 29, 25, 28, 22,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 82,SYM, 46,SYM, 83, 56,SYM,SYM, 38, 84, 85, 86,SYM, 40, 87, /* AX */ + SYM, 88,SYM, 46,SYM, 89, 56,SYM,SYM, 38, 90, 91, 92,SYM, 40, 93, /* BX */ + 94, 30, 24, 14, 33, 95, 35, 42, 41, 31, 96, 49, 51, 37, 20, 97, /* CX */ + 43, 52, 98, 34, 99, 44, 36,SYM, 55,100, 47, 50, 39, 54,101, 57, /* DX */ + 102, 30, 24, 14, 33,103, 35, 42, 41, 31,104, 49, 51, 37, 20,105, /* EX */ + 43, 52,106, 34,107, 44, 36,SYM, 55,108, 47, 50, 39, 54,109,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1250_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 2, 17, 9, 11, 0, 16, 15, 23, 1, 26, 27, 6, 12, 4, 8, /* 4X */ 
+ 13, 32, 3, 10, 5, 7, 21, 29, 25, 28, 22,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 2, 17, 9, 11, 0, 16, 15, 23, 1, 26, 27, 6, 12, 4, 8, /* 6X */ + 13, 32, 3, 10, 5, 7, 21, 29, 25, 28, 22,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM, 38,SYM, 56,110, 40,111, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 38,SYM, 56,112, 40,113, /* 9X */ + SYM,SYM,SYM, 46,SYM,114,SYM,SYM,SYM,SYM,115,SYM,SYM,SYM,SYM,116, /* AX */ + SYM,SYM,SYM, 46,SYM,SYM,SYM,SYM,SYM,117,118,SYM,119,SYM,120,121, /* BX */ + 122, 30, 24, 14, 33,123, 35, 42, 41, 31,124, 49, 51, 37, 20,125, /* CX */ + 43, 52,126, 34,127, 44, 36,SYM, 55,128, 47, 50, 39, 54,129, 57, /* DX */ + 130, 30, 24, 14, 33,131, 35, 42, 41, 31,132, 49, 51, 37, 20,133, /* EX */ + 43, 52,134, 34,135, 44, 36,SYM, 55,136, 47, 50, 39, 54,137,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Ibm852_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 2, 17, 9, 11, 0, 16, 15, 23, 1, 26, 27, 6, 12, 4, 8, /* 4X */ + 13, 32, 3, 10, 5, 7, 21, 29, 25, 28, 22,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 2, 17, 9, 11, 0, 16, 15, 23, 1, 26, 27, 6, 12, 4, 8, /* 6X */ + 13, 32, 3, 10, 5, 7, 21, 29, 25, 28, 22,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 42, 39, 31, 24, 33,138, 35, 42, 46, 49, 44, 44, 20,139, 33, 35, /* 8X */ + 31,140,141,142, 36,143,144, 56, 56, 36, 39,145,146, 46,SYM, 41, /* 9X */ + 30, 37, 34, 47,147,148, 40, 40,149,150,SYM,151, 41,152,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 30, 24, 51,153,SYM,SYM,SYM,SYM,154,155,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM, 14, 14,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + 43, 43,156, 49,157,158, 37, 20, 51,SYM,SYM,SYM,SYM,159,160,SYM, /* DX */ + 34, 57,161, 52, 52,162, 
38, 38,163, 47,164, 50, 54, 54,165,SYM, /* EX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 50, 55, 55,SYM,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 981 + * First 512 sequences: 0.997762564143313 + * Next 512 sequences (512-1024): 0.002237435856687006 + * Rest: 3.0357660829594124e-18 + * Negative sequences: TODO + */ +static const PRUint8 RomanianLangModel = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,0,3,3,3,2,3,3,3,2,2,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,0,3,3,3,0,3,3,3,3,3,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,2,3,3,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,0,3,3,3,3,2,3,3,3,3,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,0,2,2,3,3,3,3,0,2,2,3,3,2,3,0, + 3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,2,3,0,3,3,3,2,2,2,0, + 3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,2,2,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,0,3,3,3,0,3,2,3,3,3,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,2,2,3,2,0,3,2,3,3,0,3,3,2,2,0,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,0,2,2,3,3,3,0,2,3,3,3,2,2,2, + 3,3,3,3,3,2,3,3,3,2,3,3,3,2,3,3,2,3,0,0,0,3,2,3,3,0,2,2,3,3,3,2,0, + 3,3,3,2,3,3,3,3,3,3,3,2,3,3,3,2,2,3,3,2,2,2,2,3,3,2,0,0,3,2,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,0,0,2,3,0,2,0,2,3,3,0,2,2,3,0,2,2,0, + 2,3,0,3,3,3,3,3,0,3,3,3,3,3,0,3,0,3,3,3,0,3,3,0,0,0,2,2,0,0,0,0,0, + 3,3,3,3,3,2,3,3,3,0,2,3,3,2,3,3,2,3,0,0,2,3,2,3,3,0,2,0,3,2,2,2,0, + 3,3,3,3,0,3,3,3,3,2,2,2,3,2,3,2,3,0,0,0,0,0,0,2,3,0,0,0,2,0,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,3,2,0,2,2,2,3,0,2,2,3,2,2,2,0, + 3,3,3,0,0,0,0,3,2,2,2,0,0,0,3,0,0,0,0,0,2,2,0,0,2,0,0,2,0,0,0,0,0, + 3,3,3,0,3,3,3,3,3,3,0,2,2,0,3,0,0,0,0,0,0,2,0,0,2,0,0,2,0,0,0,0,0, + 0,3,0,2,3,0,3,0,0,0,0,0,3,0,0,0,0,0,2,3,0,0,2,2,0,0,0,2,0,0,0,0,0, + 3,3,3,3,3,2,3,3,3,2,2,3,2,0,3,2,2,2,0,0,0,0,0,0,3,0,2,2,2,0,2,0,0, + 
3,3,3,2,2,2,2,3,3,0,2,3,2,2,3,2,0,3,0,0,0,3,3,2,3,0,0,2,2,0,2,2,0, + 3,3,3,3,3,3,3,3,3,2,3,2,2,2,3,0,2,3,0,0,0,2,2,0,2,0,2,2,3,2,2,2,0, + 0,3,0,3,3,3,3,3,0,2,2,2,3,0,0,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,2,0,3,0,3,3,3,2,0,0,3,3,0,3,0,0,0,0,3,0,2,2,3,0,0,3,0,0,0,0, + 3,3,3,2,2,2,3,3,3,0,2,2,2,0,2,0,0,2,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0, + 3,3,3,3,2,3,3,3,3,2,3,2,3,2,2,2,2,2,2,2,2,2,0,3,0,0,0,2,3,2,2,2,0, + 3,2,3,3,3,2,3,2,3,3,3,3,3,2,0,2,0,2,0,0,0,2,2,2,0,0,2,2,0,2,2,0,0, + 3,3,3,2,3,2,2,2,3,2,3,2,2,2,0,0,2,2,0,0,0,0,0,3,0,0,0,0,2,3,0,0,0, + 2,3,0,3,3,2,2,0,0,2,2,2,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0,0,0, + 0,3,2,2,2,2,2,0,0,2,2,2,2,2,0,2,0,2,0,0,0,2,2,0,0,0,2,2,0,0,0,0,0, + 0,0,2,0,0,2,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2, +}; + + +const SequenceModel Iso_8859_16RomanianModel = +{ + Iso_8859_16_CharToOrderMap, + RomanianLangModel, + 33, + (float)0.997762564143313,
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangSlovakModel.cpp
Added
@@ -0,0 +1,289 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Slovak *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-21 13:33:10.331339 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Ibm852_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 20, 15, 11, 2, 29, 30, 17, 4, 18, 7, 10, 12, 3, 0, /* 4X */ + 13, 40, 6, 8, 5, 14, 9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 20, 15, 11, 2, 29, 30, 17, 4, 18, 7, 10, 12, 3, 0, /* 6X */ + 13, 40, 6, 8, 5, 14, 9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 51, 46, 25, 62, 38, 48, 47, 51, 49, 54, 50, 50, 63, 64, 38, 47, /* 8X */ + 25, 42, 42, 32, 43, 33, 33, 65, 66, 43, 46, 31, 31, 49,SYM, 24, /* 9X */ + 21, 23, 35, 27, 67, 68, 26, 26, 69, 70,SYM, 71, 24, 59,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 21, 72, 41, 59,SYM,SYM,SYM,SYM, 61, 61,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM, 56, 56,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + 55, 55, 39, 54, 39, 36, 23, 73, 
41,SYM,SYM,SYM,SYM, 74, 48,SYM, /* DX */ + 35, 58, 32, 52, 52, 36, 28, 28, 44, 27, 44, 60, 22, 22, 75,SYM, /* EX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 60, 45, 45,SYM,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_2_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 20, 15, 11, 2, 29, 30, 17, 4, 18, 7, 10, 12, 3, 0, /* 4X */ + 13, 40, 6, 8, 5, 14, 9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 20, 15, 11, 2, 29, 30, 17, 4, 18, 7, 10, 12, 3, 0, /* 6X */ + 13, 40, 6, 8, 5, 14, 9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 76,SYM, 49,SYM, 33, 77,SYM,SYM, 28, 59, 31, 78,SYM, 26, 61, /* AX */ + SYM, 79,SYM, 49,SYM, 33, 80,SYM,SYM, 28, 59, 31, 81,SYM, 26, 61, /* BX */ + 44, 21, 82, 56, 38, 42, 47, 51, 24, 25, 83, 54, 41, 23, 84, 39, /* CX */ + 55, 52, 36, 35, 32, 50, 43,SYM, 45, 48, 27, 60, 46, 22, 85, 58, /* DX */ + 44, 21, 86, 56, 38, 42, 47, 51, 24, 25, 87, 54, 41, 23, 88, 39, /* EX */ + 55, 52, 36, 35, 32, 50, 43,SYM, 45, 48, 27, 60, 46, 22, 89,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Mac_Centraleurope_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 20, 15, 11, 2, 29, 30, 17, 4, 18, 7, 10, 12, 3, 0, /* 4X 
*/ + 13, 40, 6, 8, 5, 14, 9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 20, 15, 11, 2, 29, 30, 17, 4, 18, 7, 10, 12, 3, 0, /* 6X */ + 13, 40, 6, 8, 5, 14, 9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 38, 90, 91, 25, 92, 43, 46, 21, 93, 24, 38, 24, 47, 47, 25, 94, /* 8X */ + 95, 39, 23, 39, 96, 97, 98, 35, 99, 32, 43,100, 27, 41, 41, 46, /* 9X */ + SYM,SYM,101,SYM,SYM,SYM,SYM, 58,SYM,SYM,SYM,102,SYM,SYM,103,104, /* AX */ + 105, 57,SYM,SYM, 57,106,SYM,SYM, 49,107,108, 33, 33, 42, 42,109, /* BX */ + 110, 52,SYM,SYM, 52, 36,SYM,SYM,SYM,SYM,SYM, 36, 50,111, 50, 53, /* CX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 53, 44, 44, 45,SYM,SYM, 45,112, /* DX */ + 113, 28,SYM,SYM, 28,114,115, 21, 31, 31, 23, 26, 26,116, 35, 32, /* EX */ + 117, 48, 27, 48, 60, 60,118,119, 22, 22,120, 61, 49, 61,121,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1250_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 20, 15, 11, 2, 29, 30, 17, 4, 18, 7, 10, 12, 3, 0, /* 4X */ + 13, 40, 6, 8, 5, 14, 9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 20, 15, 11, 2, 29, 30, 17, 4, 18, 7, 10, 12, 3, 0, /* 6X */ + 13, 40, 6, 8, 5, 14, 9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM, 28,SYM,122, 31, 26,123, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 28,SYM,124, 31, 26,125, /* 9X */ + SYM,SYM,SYM, 49,SYM,126,SYM,SYM,SYM,SYM, 59,SYM,SYM,SYM,SYM, 61, /* AX */ + SYM,SYM,SYM, 49,SYM,SYM,SYM,SYM,SYM,127, 59,SYM, 33,SYM, 33, 61, /* BX */ + 44, 21,128, 56, 38, 42, 47, 51, 24, 25,129, 54, 41, 23,130, 39, /* CX */ + 55, 52, 36, 35, 32, 50, 43,SYM, 45, 48, 27, 60, 46, 22,131, 58, /* DX */ + 44, 21,132, 56, 
38, 42, 47, 51, 24, 25,133, 54, 41, 23,134, 39, /* EX */ + 55, 52, 36, 35, 32, 50, 43,SYM, 45, 48, 27, 60, 46, 22,135,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 1181 + * First 512 sequences: 0.9733303573968434 + * Next 512 sequences (512-1024): 0.026317344239265295 + * Rest: 0.0003522983638913346 + * Negative sequences: TODO + */ +static const PRUint8 SlovakLangModel = +{ + 2,2,2,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2, + 0,0,3,2,3,1,2,3,3,1,0,3,2,0,3,2,0,1,2,0,0,0,0, + 2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0, + 0,0,3,0,3,0,3,3,3,3,0,2,3,1,2,2,0,2,2,0,0,0,0, + 3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3, + 0,2,3,0,3,2,3,3,3,2,0,3,3,3,3,2,0,3,2,0,0,1,0, + 3,3,3,3,3,3,2,3,3,2,2,3,2,2,3,3,2,2,2,3,2,3, + 3,3,3,3,2,3,3,2,3,0,2,0,0,2,0,2,0,0,2,2,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3, + 0,3,3,2,3,0,3,3,3,3,0,2,2,3,2,2,0,0,2,0,0,0,0, + 3,3,3,3,3,2,3,3,3,3,3,1,3,2,3,2,3,3,2,3,2,3, + 3,3,2,3,0,3,2,2,2,1,0,2,0,3,2,2,2,2,1,2,1,1,2, + 3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,3,2,2,2,3,3,3, + 3,3,3,3,2,2,2,2,3,2,3,0,2,3,2,2,2,0,2,0,0,1,0, + 3,3,3,2,3,3,3,2,2,3,3,3,3,1,3,3,2,2,2,3,2,3, + 3,2,2,3,2,3,2,2,2,0,3,2,0,2,2,2,0,0,0,0,0,2,1, + 3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,2,2,2,3,2,3, + 2,3,2,2,1,3,0,2,2,3,2,2,0,2,2,2,0,0,2,0,0,0,0, + 3,3,3,3,3,2,3,2,3,0,3,3,2,3,3,2,3,2,0,3,2,3, + 3,3,2,3,2,2,3,1,2,0,2,0,0,0,2,0,3,2,0,2,2,1,2, + 3,3,3,3,3,3,2,3,3,2,3,2,2,2,3,2,2,2,2,3,3,3, + 3,3,2,3,2,3,2,2,3,0,1,0,0,3,2,0,0,0,0,0,0,0,0, + 3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,3,3,2,2,3,2,3, + 3,3,2,2,2,2,2,2,2,0,3,3,2,2,2,2,0,0,0,2,2,0,0, + 3,3,3,3,3,3,2,2,3,1,2,2,3,3,3,2,0,0,2,3,3,3, + 2,3,0,2,2,2,0,0,2,0,3,0,1,2,1,0,3,0,2,0,0,2,2, + 3,3,3,3,3,3,3,2,2,0,3,2,2,2,3,2,1,2,0,2,2,3, + 2,3,1,2,0,2,2,0,1,2,3,1,0,2,2,0,2,0,0,2,2,0,0, + 2,2,2,3,2,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3, + 0,3,3,1,3,1,2,2,3,2,0,2,2,0,1,2,0,2,2,0,0,0,0, + 3,3,3,3,3,2,2,3,2,2,2,2,2,0,3,2,2,3,0,2,0,2, + 1,3,0,2,0,3,0,1,2,2,0,0,0,2,2,0,0,0,2,0,0,0,0, + 
3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,2,2,3,2,3,2,3, + 3,2,1,2,1,2,2,0,2,2,0,2,0,2,2,2,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,2,2,2,3,2,3,1,3,2,0,2,1,3,2,3, + 3,2,2,2,0,2,2,1,0,0,0,2,0,1,2,2,1,0,0,0,2,1,2, + 3,3,3,3,3,3,2,2,3,3,2,2,3,2,3,2,2,2,2,0,2,2, + 0,3,2,0,0,3,3,0,2,1,0,2,0,2,0,0,1,0,0,0,0,0,0, + 2,2,2,3,2,3,3,3,3,3,3,2,3,3,2,3,3,3,2,0,3,0, + 0,0,2,0,2,2,3,1,2,3,0,1,0,1,2,1,0,0,0,0,0,0,0, + 3,3,3,3,3,1,3,2,3,2,3,3,2,0,3,2,2,2,3,3,2,3, + 2,2,2,2,1,2,2,0,2,0,1,0,0,1,2,0,1,0,0,0,0,1,0,
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangSloveneModel.cpp
Added
@@ -0,0 +1,259 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Slovene *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-28 22:06:46.134717 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Iso_8859_2_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, /* 4X */ + 11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, /* 6X */ + 11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 41,SYM, 42,SYM, 43, 44,SYM,SYM, 22, 45, 46, 47,SYM, 23, 48, /* AX */ + SYM, 49,SYM, 50,SYM, 51, 52,SYM,SYM, 22, 53, 54, 55,SYM, 23, 56, /* BX */ + 57, 32, 58, 59, 60, 61, 37, 34, 21, 29, 62, 36, 63, 30, 64, 65, /* CX */ + 66, 67, 68, 31, 35, 69, 
70,SYM, 71, 72, 39, 73, 74, 40, 75, 76, /* DX */ + 77, 32, 78, 79, 80, 81, 37, 34, 21, 29, 82, 36, 83, 30, 84, 85, /* EX */ + 86, 87, 88, 31, 35, 89, 90,SYM, 91, 92, 39, 93, 94, 40, 95,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_16_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, /* 4X */ + 11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, /* 6X */ + 11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 96, 97, 98,SYM,SYM, 22,SYM, 22,SYM, 99,SYM,100,SYM,101,102, /* AX */ + SYM,SYM, 21,103, 23,SYM,SYM,SYM, 23, 21,104,SYM,105,106,107,108, /* BX */ + 109, 32,110,111,112, 37,113, 34,114, 29, 33, 36,115, 30,116,117, /* CX */ + 118,119,120, 31, 35,121,122,123,124,125, 39,126,127,128,129,130, /* DX */ + 131, 32,132,133,134, 37,135, 34,136, 29, 33, 36,137, 30,138,139, /* EX */ + 140,141,142, 31, 35,143,144,145,146,147, 39,148,149,150,151,152, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1250_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, 
/* 4X */ + 11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, /* 6X */ + 11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM, 22,SYM,153,154, 23,155, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 22,SYM,156,157, 23,158, /* 9X */ + SYM,SYM,SYM,159,SYM,160,SYM,SYM,SYM,SYM,161,SYM,SYM,SYM,SYM,162, /* AX */ + SYM,SYM,SYM,163,SYM,SYM,SYM,SYM,SYM,164,165,SYM,166,SYM,167,168, /* BX */ + 169, 32,170,171,172,173, 37, 34, 21, 29,174, 36,175, 30,176,177, /* CX */ + 178,179,180, 31, 35,181,182,SYM,183,184, 39,185,186, 40,187,188, /* DX */ + 189, 32,190,191,192,193, 37, 34, 21, 29,194, 36,195, 30,196,197, /* EX */ + 198,199,200, 31, 35,201,202,SYM,203,204, 39,205,206, 40,207,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Mac_Centraleurope_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, /* 4X */ + 11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, /* 6X */ + 11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 208,209,210, 29,211,212,213, 32,214, 21,215, 21, 37, 37, 29,216, /* 8X */ + 217,218, 30,219, 38, 38,220, 31,221, 35,222,223, 39,224,225,226, /* 9X */ + SYM,SYM,227,SYM,SYM,SYM,SYM,228,SYM,SYM,SYM,229,SYM,SYM,230,231, /* AX */ + 232,233,SYM,SYM,234,235,SYM,SYM,236,237,238,239,240,241,242,243, /* BX */ + 244,245,SYM,SYM,246,247,SYM,SYM,SYM,SYM,SYM,248,249,249,249,249, /* CX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,249,249,249,249,SYM,SYM,249,249, /* DX */ + 
249, 22,SYM,SYM, 22,249,249, 32,249,249, 30, 23, 23,249, 31, 35, /* EX */ + 249,249, 39,249,249,249,249,249, 40, 40,249,249,249,249,249,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Ibm852_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, /* 4X */ + 11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, /* 6X */ + 11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 34,249, 29,249,249,249, 37, 34,249, 36,249,249,249,249,249, 37, /* 8X */ + 29,249,249, 35,249,249,249,249,249,249,249,249,249,249,SYM, 21, /* 9X */ + 32, 30, 31, 39,249,249, 23, 23,249,249,SYM,249, 21,249,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 32,249,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM,249,249,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + 249,249,249, 36,249,249, 30,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* DX */ + 31,249, 35,249,249,249, 22, 22,249, 39,249,249, 40, 40,249,SYM, /* EX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,249,249,249,SYM,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 727 + * First 512 sequences: 0.9983524317161332 + * Next 512 sequences (512-1024): 0.0016475682838668457 + * Rest: -3.859759734048396e-17 + * Negative sequences: TODO + */ +static const PRUint8 SloveneLangModel = +{ + 2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,0, + 
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2, + 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,2,3,3,3,2,0,0,3,2,3,3,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,0,0,0,3,2,3,3,0, + 3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,2,3,2,3,3,3,2,3,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,2,3,3,2,3,2,0, + 3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,2,3,3,3,3,2,2,2,2,0,0, + 3,3,3,3,3,3,3,3,2,3,0,3,3,3,2,2,3,3,3,3,3,2,2,0,0,0,3,2,2, + 3,3,3,3,3,3,3,3,3,3,3,0,2,3,3,2,3,0,2,3,3,0,3,0,2,0,3,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,2,2,3,2,0, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,3,2,3,3,2,2,2,0,2,2,3,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,0,2,0,0,0, + 3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,0,0, + 3,3,3,3,3,3,3,2,0,3,3,3,2,2,2,0,3,2,3,2,3,0,0,0,2,2,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,0,3,0,2,2,0,3,3,2,2,0,3,0,0, + 3,3,3,3,3,3,3,3,0,3,2,3,3,3,2,2,3,2,2,3,3,0,0,0,2,2,3,2,2, + 3,3,3,3,3,3,2,3,0,3,3,3,3,2,2,2,3,0,2,0,0,2,0,0,2,0,2,2,0,
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/LangModels/LangSpanishModel.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangSpanishModel.cpp
Changed
@@ -198,4 +198,4 @@ (float)0.9970385677528184, PR_TRUE, "WINDOWS-1252" -}; \ No newline at end of file +};
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangSwedishModel.cpp
Added
@@ -0,0 +1,261 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Swedish *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-28 22:29:21.480940 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Windows_1252_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 4X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 6X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 34,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM, 49,ILL, 50,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 51,SYM, 52,ILL, 53, 54, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 55,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 56, 44, 57, 58, 17, 19, 38, 40, 32, 28, 45, 59, 60, 61, 47, 62, /* CX */ + 63, 64, 65, 66, 35, 67, 
21,SYM, 37, 68, 69, 70, 31, 71, 72, 73, /* DX */ + 74, 44, 75, 76, 17, 19, 38, 40, 32, 28, 45, 77, 78, 79, 47, 80, /* EX */ + 81, 82, 83, 84, 35, 85, 21,SYM, 37, 86, 87, 88, 31, 89, 90, 91, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_9_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 4X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 6X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 92,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 93, 44, 94, 95, 17, 19, 38, 40, 32, 28, 45, 96, 97, 98, 47, 99, /* CX */ + 100,101,102,103, 35,104, 21,SYM, 37,105,106,107, 31,108,109,110, /* DX */ + 111, 44,112,113, 17, 19, 38, 40, 32, 28, 45,114,115,116, 47,117, /* EX */ + 118,119,120,121, 35,122, 21,SYM, 37,123,124,125, 31, 42,126,127, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_1_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 4X 
*/ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 6X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,128,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 129, 44,130,131, 17, 19, 38, 40, 32, 28, 45,132,133,134, 47,135, /* CX */ + 136,137,138,139, 35,140, 21,SYM, 37,141,142,143, 31,144,145,146, /* DX */ + 147, 44,148,149, 17, 19, 38, 40, 32, 28, 45,150,151,152, 47,153, /* EX */ + 154,155,156,157, 35,158, 21,SYM, 37,159,160,161, 31,162,163,164, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_4_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 4X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 6X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,165,166,167,SYM,168,169,SYM,SYM,170,171,172,173,SYM,174,SYM, /* AX */ + SYM,175,SYM,176,SYM,177,178,SYM,SYM,179,180,181,182, 43,183, 43, /* BX */ + 29, 44,184,185, 17, 19, 38,186,187, 28,188,189, 39,190, 47, 41, /* CX */ + 191,192, 33,193, 35,194, 21,SYM, 37, 36,195,196, 31,197, 46,198, /* DX */ + 29, 
44,199,200, 17, 19, 38,201,202, 28,203,204, 39,205, 47, 41, /* EX */ + 206,207, 33,208, 35,209, 21,SYM, 37, 36,210,211, 31,212, 46,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_15_CharToOrderMap = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 4X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 6X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,213,SYM,214,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,215,216,SYM,SYM,217,SYM,SYM,SYM,218,219,220,SYM, /* BX */ + 221, 44,222,223, 17, 19, 38, 40, 32, 28, 45,224,225,226, 47,227, /* CX */ + 228,229,230,231, 35,232, 21,SYM, 37,233,234,235, 31,236,237,238, /* DX */ + 239, 44,240,241, 17, 19, 38, 40, 32, 28, 45,242,243,244, 47,245, /* EX */ + 246,247,248,249, 35,249, 21,SYM, 37,249,249,249, 31,249,249,249, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 748 + * First 512 sequences: 0.997323508584682 + * Next 512 sequences (512-1024): 0.0026764914153179875 + * Rest: 1.7780915628762273e-17 + * Negative sequences: TODO + */ +static const PRUint8 SwedishLangModel = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,3,2,3,3,3,3,3,2,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,2,3,3,3,3,3,3,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,0,0, + 
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,2,2,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,2,2,3,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,3,3,2,3,3,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,0,2,0,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,0,2,0,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,2,0,3,3,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,0,2,2,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,2,3,3,3,3,0,2,3,2,0,0,0,2,0,0,0, + 3,3,3,2,3,2,3,3,3,2,0,2,2,2,3,2,3,3,0,3,2,3,0,3,3,0,0,0,2,0,0, + 3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,3,2,2,2,2,3,2,0,2,3,2,0, + 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,0,2,0,3,2,3,2,0,3,0,0,0,2,0, + 2,2,3,3,3,3,0,3,0,3,3,3,3,3,3,3,2,2,0,0,3,0,3,0,0,3,0,0,0,0,0, + 3,3,3,3,3,2,3,2,3,2,2,2,2,0,0,0,3,3,2,3,2,3,2,3,3,0,0,3,0,2,0, + 2,3,3,3,3,3,2,3,0,3,3,3,3,3,2,0,0,0,2,0,0,2,3,0,0,0,0,0,0,0,0, + 3,3,3,3,3,2,3,3,3,2,3,2,2,2,2,0,3,0,3,0,3,2,2,0,3,0,0,2,2,0,2,
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/LangModels/LangThaiModel.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangThaiModel.cpp
Changed
@@ -262,4 +262,4 @@ (float)0.8815720594354438, PR_FALSE, "ISO-8859-11" -}; \ No newline at end of file +};
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/LangModels/LangTurkishModel.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangTurkishModel.cpp
Changed
@@ -170,4 +170,4 @@ (float)0.991865243864388, PR_FALSE, "ISO-8859-9" -}; \ No newline at end of file +};
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/LangModels/LangVietnameseModel.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/LangModels/LangVietnameseModel.cpp
Changed
@@ -244,4 +244,4 @@ (float)0.9321889118082535, PR_FALSE, "VISCII" -}; \ No newline at end of file +};
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsBig5Prober.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsBig5Prober.cpp
Changed
@@ -46,7 +46,7 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen) { - nsSMState codingState; + PRUint32 codingState; for (PRUint32 i = 0; i < aLen; i++) {
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsCharSetProber.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsCharSetProber.cpp
Changed
@@ -74,7 +74,7 @@ if (meetMSB && curPtr > prevPtr) while (prevPtr < curPtr) *newptr++ = *prevPtr++; - newLen = newptr - *newBuf; + newLen = (PRUint32) (newptr - *newBuf); return PR_TRUE; } @@ -119,7 +119,7 @@ while (prevPtr < curPtr) *newptr++ = *prevPtr++; - newLen = newptr - *newBuf; + newLen = (PRUint32) (newptr - *newBuf); return PR_TRUE; }
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsCodingStateMachine.h -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsCodingStateMachine.h
Changed
@@ -39,11 +39,12 @@ #include "nsPkgInt.h" -typedef enum { - eStart = 0, - eError = 1, - eItsMe = 2 -} nsSMState; +/* Apart from these 3 generic states, machine states are specific to + * each charset prober. + */ +#define eStart 0 +#define eError 1 +#define eItsMe 2 #define GETCLASS(c) GETFROMPCK(((unsigned char)(c)), mModel->classTable) @@ -60,7 +61,7 @@ class nsCodingStateMachine { public: nsCodingStateMachine(const SMModel* sm) : mModel(sm) { mCurrentState = eStart; } - nsSMState NextState(char c){ + PRUint32 NextState(char c){ //for each byte we get its class , if it is first byte, we also get byte length PRUint32 byteCls = GETCLASS(c); if (mCurrentState == eStart) @@ -69,8 +70,8 @@ mCurrentCharLen = mModel->charLenTablebyteCls; } //from byte's class and stateTable, we get its next state - mCurrentState=(nsSMState)GETFROMPCK(mCurrentState*(mModel->classFactor)+byteCls, - mModel->stateTable); + mCurrentState = GETFROMPCK(mCurrentState * mModel->classFactor + byteCls, + mModel->stateTable); mCurrentBytePos++; return mCurrentState; } @@ -79,7 +80,7 @@ const char * GetCodingStateMachine() {return mModel->name;} protected: - nsSMState mCurrentState; + PRUint32 mCurrentState; PRUint32 mCurrentCharLen; PRUint32 mCurrentBytePos;
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsEUCJPProber.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsEUCJPProber.cpp
Changed
@@ -52,7 +52,7 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen) { - nsSMState codingState; + PRUint32 codingState; for (PRUint32 i = 0; i < aLen; i++) {
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsEUCKRProber.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsEUCKRProber.cpp
Changed
@@ -47,7 +47,7 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen) { - nsSMState codingState; + PRUint32 codingState; for (PRUint32 i = 0; i < aLen; i++) {
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsEUCKRProber.h -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsEUCKRProber.h
Changed
@@ -51,7 +51,12 @@ } virtual ~nsEUCKRProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "EUC-KR";} + /* "Unified Hangul Code", also called "CP949" or "Windows-949" is a + * superset of EUC-KR. Though not fully ok to return UHC here (a + * separate prober would be better), it is acceptable, since many + * Korean documents are actually created with this character set. + */ + const char* GetCharSetName() {return "UHC";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void);
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsEUCTWProber.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsEUCTWProber.cpp
Changed
@@ -47,7 +47,7 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen) { - nsSMState codingState; + PRUint32 codingState; for (PRUint32 i = 0; i < aLen; i++) {
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsEscCharsetProber.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsEscCharsetProber.cpp
Changed
@@ -75,7 +75,7 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen) { - nsSMState codingState; + PRUint32 codingState; PRInt32 j; PRUint32 i;
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsEscSM.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsEscSM.cpp
Changed
@@ -197,7 +197,11 @@ PCK4BITS(eError,eError,eError,eError,eItsMe,eError,eStart,eStart) //40-47 }; -static const PRUint32 ISO2022JPCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0}; +/* XXX: I needed to complete the 2 last classes for this CharLenTable + * but I did it a bit randomly. Cf. bug 101030. + * Let's check this piece of code again later when I understand it + * better. */ +static const PRUint32 ISO2022JPCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; const SMModel ISO2022JPSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls },
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsGB2312Prober.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsGB2312Prober.cpp
Changed
@@ -52,7 +52,7 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen) { - nsSMState codingState; + PRUint32 codingState; for (PRUint32 i = 0; i < aLen; i++) {
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsSBCSGroupProber.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsSBCSGroupProber.cpp
Changed
@@ -110,6 +110,92 @@ mProbers32 = new nsSingleByteCharSetProber(&Iso_8859_15DanishModel); mProbers33 = new nsSingleByteCharSetProber(&Iso_8859_1DanishModel); mProbers34 = new nsSingleByteCharSetProber(&Windows_1252DanishModel); + mProbers35 = new nsSingleByteCharSetProber(&Ibm865DanishModel); + + mProbers36 = new nsSingleByteCharSetProber(&Iso_8859_13LithuanianModel); + mProbers37 = new nsSingleByteCharSetProber(&Iso_8859_10LithuanianModel); + mProbers38 = new nsSingleByteCharSetProber(&Iso_8859_4LithuanianModel); + + mProbers39 = new nsSingleByteCharSetProber(&Iso_8859_13LatvianModel); + mProbers40 = new nsSingleByteCharSetProber(&Iso_8859_10LatvianModel); + mProbers41 = new nsSingleByteCharSetProber(&Iso_8859_4LatvianModel); + + mProbers42 = new nsSingleByteCharSetProber(&Iso_8859_1PortugueseModel); + mProbers43 = new nsSingleByteCharSetProber(&Iso_8859_9PortugueseModel); + mProbers44 = new nsSingleByteCharSetProber(&Iso_8859_15PortugueseModel); + mProbers45 = new nsSingleByteCharSetProber(&Windows_1252PortugueseModel); + + mProbers46 = new nsSingleByteCharSetProber(&Iso_8859_3MalteseModel); + + mProbers47 = new nsSingleByteCharSetProber(&Windows_1250CzechModel); + mProbers48 = new nsSingleByteCharSetProber(&Iso_8859_2CzechModel); + mProbers49 = new nsSingleByteCharSetProber(&Mac_CentraleuropeCzechModel); + mProbers50 = new nsSingleByteCharSetProber(&Ibm852CzechModel); + + mProbers51 = new nsSingleByteCharSetProber(&Windows_1250SlovakModel); + mProbers52 = new nsSingleByteCharSetProber(&Iso_8859_2SlovakModel); + mProbers53 = new nsSingleByteCharSetProber(&Mac_CentraleuropeSlovakModel); + mProbers54 = new nsSingleByteCharSetProber(&Ibm852SlovakModel); + + mProbers55 = new nsSingleByteCharSetProber(&Windows_1250PolishModel); + mProbers56 = new nsSingleByteCharSetProber(&Iso_8859_2PolishModel); + mProbers57 = new nsSingleByteCharSetProber(&Iso_8859_13PolishModel); + mProbers58 = new nsSingleByteCharSetProber(&Iso_8859_16PolishModel); + mProbers59 = new 
nsSingleByteCharSetProber(&Mac_CentraleuropePolishModel); + mProbers60 = new nsSingleByteCharSetProber(&Ibm852PolishModel); + + mProbers61 = new nsSingleByteCharSetProber(&Iso_8859_1FinnishModel); + mProbers62 = new nsSingleByteCharSetProber(&Iso_8859_4FinnishModel); + mProbers63 = new nsSingleByteCharSetProber(&Iso_8859_9FinnishModel); + mProbers64 = new nsSingleByteCharSetProber(&Iso_8859_13FinnishModel); + mProbers65 = new nsSingleByteCharSetProber(&Iso_8859_15FinnishModel); + mProbers66 = new nsSingleByteCharSetProber(&Windows_1252FinnishModel); + + mProbers67 = new nsSingleByteCharSetProber(&Iso_8859_1ItalianModel); + mProbers68 = new nsSingleByteCharSetProber(&Iso_8859_3ItalianModel); + mProbers69 = new nsSingleByteCharSetProber(&Iso_8859_9ItalianModel); + mProbers70 = new nsSingleByteCharSetProber(&Iso_8859_15ItalianModel); + mProbers71 = new nsSingleByteCharSetProber(&Windows_1252ItalianModel); + + mProbers72 = new nsSingleByteCharSetProber(&Windows_1250CroatianModel); + mProbers73 = new nsSingleByteCharSetProber(&Iso_8859_2CroatianModel); + mProbers74 = new nsSingleByteCharSetProber(&Iso_8859_13CroatianModel); + mProbers75 = new nsSingleByteCharSetProber(&Iso_8859_16CroatianModel); + mProbers76 = new nsSingleByteCharSetProber(&Mac_CentraleuropeCroatianModel); + mProbers77 = new nsSingleByteCharSetProber(&Ibm852CroatianModel); + + mProbers78 = new nsSingleByteCharSetProber(&Windows_1252EstonianModel); + mProbers79 = new nsSingleByteCharSetProber(&Windows_1257EstonianModel); + mProbers80 = new nsSingleByteCharSetProber(&Iso_8859_4EstonianModel); + mProbers81 = new nsSingleByteCharSetProber(&Iso_8859_13EstonianModel); + mProbers82 = new nsSingleByteCharSetProber(&Iso_8859_15EstonianModel); + + mProbers83 = new nsSingleByteCharSetProber(&Iso_8859_1IrishModel); + mProbers84 = new nsSingleByteCharSetProber(&Iso_8859_9IrishModel); + mProbers85 = new nsSingleByteCharSetProber(&Iso_8859_15IrishModel); + mProbers86 = new 
nsSingleByteCharSetProber(&Windows_1252IrishModel); + + mProbers87 = new nsSingleByteCharSetProber(&Windows_1250RomanianModel); + mProbers88 = new nsSingleByteCharSetProber(&Iso_8859_2RomanianModel); + mProbers89 = new nsSingleByteCharSetProber(&Iso_8859_16RomanianModel); + mProbers90 = new nsSingleByteCharSetProber(&Ibm852RomanianModel); + + mProbers91 = new nsSingleByteCharSetProber(&Windows_1250SloveneModel); + mProbers92 = new nsSingleByteCharSetProber(&Iso_8859_2SloveneModel); + mProbers93 = new nsSingleByteCharSetProber(&Iso_8859_16SloveneModel); + mProbers94 = new nsSingleByteCharSetProber(&Mac_CentraleuropeSloveneModel); + mProbers95 = new nsSingleByteCharSetProber(&Ibm852SloveneModel); + + mProbers96 = new nsSingleByteCharSetProber(&Iso_8859_1SwedishModel); + mProbers97 = new nsSingleByteCharSetProber(&Iso_8859_4SwedishModel); + mProbers98 = new nsSingleByteCharSetProber(&Iso_8859_9SwedishModel); + mProbers99 = new nsSingleByteCharSetProber(&Iso_8859_15SwedishModel); + mProbers100 = new nsSingleByteCharSetProber(&Windows_1252SwedishModel); + + mProbers101 = new nsSingleByteCharSetProber(&Iso_8859_15NorwegianModel); + mProbers102 = new nsSingleByteCharSetProber(&Iso_8859_1NorwegianModel); + mProbers103 = new nsSingleByteCharSetProber(&Windows_1252NorwegianModel); + mProbers104 = new nsSingleByteCharSetProber(&Ibm865NorwegianModel); Reset(); }
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsSBCSGroupProber.h -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsSBCSGroupProber.h
Changed
@@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 35 +#define NUM_OF_SBCS_PROBERS 105 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber {
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsSBCharSetProber.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsSBCharSetProber.cpp
Changed
@@ -63,7 +63,7 @@ } if (order < mModel->freqCharCount) { - mFreqChar++; + mFreqChar++; if (mLastOrder < mModel->freqCharCount) {
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsSBCharSetProber.h -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsSBCharSetProber.h
Changed
@@ -171,6 +171,92 @@ extern const SequenceModel Iso_8859_15DanishModel; extern const SequenceModel Iso_8859_1DanishModel; extern const SequenceModel Windows_1252DanishModel; +extern const SequenceModel Ibm865DanishModel; + +extern const SequenceModel Iso_8859_13LithuanianModel; +extern const SequenceModel Iso_8859_10LithuanianModel; +extern const SequenceModel Iso_8859_4LithuanianModel; + +extern const SequenceModel Iso_8859_13LatvianModel; +extern const SequenceModel Iso_8859_10LatvianModel; +extern const SequenceModel Iso_8859_4LatvianModel; + +extern const SequenceModel Iso_8859_1PortugueseModel; +extern const SequenceModel Iso_8859_9PortugueseModel; +extern const SequenceModel Iso_8859_15PortugueseModel; +extern const SequenceModel Windows_1252PortugueseModel; + +extern const SequenceModel Iso_8859_3MalteseModel; + +extern const SequenceModel Windows_1250CzechModel; +extern const SequenceModel Iso_8859_2CzechModel; +extern const SequenceModel Ibm852CzechModel; +extern const SequenceModel Mac_CentraleuropeCzechModel; + +extern const SequenceModel Windows_1250SlovakModel; +extern const SequenceModel Iso_8859_2SlovakModel; +extern const SequenceModel Ibm852SlovakModel; +extern const SequenceModel Mac_CentraleuropeSlovakModel; + +extern const SequenceModel Windows_1250PolishModel; +extern const SequenceModel Iso_8859_2PolishModel; +extern const SequenceModel Iso_8859_13PolishModel; +extern const SequenceModel Iso_8859_16PolishModel; +extern const SequenceModel Ibm852PolishModel; +extern const SequenceModel Mac_CentraleuropePolishModel; + +extern const SequenceModel Iso_8859_1FinnishModel; +extern const SequenceModel Iso_8859_4FinnishModel; +extern const SequenceModel Iso_8859_9FinnishModel; +extern const SequenceModel Iso_8859_13FinnishModel; +extern const SequenceModel Iso_8859_15FinnishModel; +extern const SequenceModel Windows_1252FinnishModel; + +extern const SequenceModel Iso_8859_1ItalianModel; +extern const SequenceModel Iso_8859_3ItalianModel; +extern const 
SequenceModel Iso_8859_9ItalianModel; +extern const SequenceModel Iso_8859_15ItalianModel; +extern const SequenceModel Windows_1252ItalianModel; + +extern const SequenceModel Windows_1250CroatianModel; +extern const SequenceModel Iso_8859_2CroatianModel; +extern const SequenceModel Iso_8859_13CroatianModel; +extern const SequenceModel Iso_8859_16CroatianModel; +extern const SequenceModel Ibm852CroatianModel; +extern const SequenceModel Mac_CentraleuropeCroatianModel; + +extern const SequenceModel Windows_1252EstonianModel; +extern const SequenceModel Windows_1257EstonianModel; +extern const SequenceModel Iso_8859_4EstonianModel; +extern const SequenceModel Iso_8859_13EstonianModel; +extern const SequenceModel Iso_8859_15EstonianModel; + +extern const SequenceModel Iso_8859_15IrishModel; +extern const SequenceModel Iso_8859_9IrishModel; +extern const SequenceModel Iso_8859_1IrishModel; +extern const SequenceModel Windows_1252IrishModel; + +extern const SequenceModel Windows_1250RomanianModel; +extern const SequenceModel Iso_8859_2RomanianModel; +extern const SequenceModel Iso_8859_16RomanianModel; +extern const SequenceModel Ibm852RomanianModel; + +extern const SequenceModel Windows_1250SloveneModel; +extern const SequenceModel Iso_8859_2SloveneModel; +extern const SequenceModel Iso_8859_16SloveneModel; +extern const SequenceModel Ibm852SloveneModel; +extern const SequenceModel Mac_CentraleuropeSloveneModel; + +extern const SequenceModel Iso_8859_1SwedishModel; +extern const SequenceModel Iso_8859_4SwedishModel; +extern const SequenceModel Iso_8859_9SwedishModel; +extern const SequenceModel Iso_8859_15SwedishModel; +extern const SequenceModel Windows_1252SwedishModel; + +extern const SequenceModel Iso_8859_15NorwegianModel; +extern const SequenceModel Iso_8859_1NorwegianModel; +extern const SequenceModel Windows_1252NorwegianModel; +extern const SequenceModel Ibm865NorwegianModel; #endif /* nsSingleByteCharSetProber_h__ */
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsSJISProber.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsSJISProber.cpp
Changed
@@ -52,7 +52,7 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen) { - nsSMState codingState; + PRUint32 codingState; for (PRUint32 i = 0; i < aLen; i++) {
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsUTF8Prober.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsUTF8Prober.cpp
Changed
@@ -46,7 +46,7 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen) { - nsSMState codingState; + PRUint32 codingState; for (PRUint32 i = 0; i < aLen; i++) {
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/nsUniversalDetector.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/nsUniversalDetector.cpp
Changed
@@ -113,6 +113,7 @@ { mStart = PR_FALSE; if (aLen > 2) + { switch (aBuf0) { case '\xEF': @@ -153,12 +154,13 @@ } break; } + } - if (mDetectedCharset) - { + if (mDetectedCharset) + { mDone = PR_TRUE; return NS_OK; - } + } } PRUint32 i; @@ -240,16 +242,6 @@ mDone = PR_TRUE; mDetectedCharset = mEscCharSetProber->GetCharSetName(); } - else if (mNbspFound) - { - mDetectedCharset = "ISO-8859-1"; - } - else - { - /* ASCII with the ESC character (or the sequence "~{") is still - * ASCII until proven otherwise. */ - mDetectedCharset = "ASCII"; - } break; case eHighbyte: for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) @@ -268,17 +260,6 @@ break; default: - if (mNbspFound) - { - /* ISO-8859-1 is a good result candidate for ASCII + NBSP. - * (though it could have been any ISO-8859 encoding). */ - mDetectedCharset = "ISO-8859-1"; - } - else - { - /* Pure ASCII */ - mDetectedCharset = "ASCII"; - } break; } return NS_OK; @@ -295,6 +276,29 @@ return; } + if (! mDetectedCharset) + { + switch (mInputState) + { + case eEscAscii: + case ePureAscii: + if (mNbspFound) + { + /* ISO-8859-1 is a good result candidate for ASCII + NBSP. + * (though it could have been any ISO-8859 encoding). */ + mDetectedCharset = "ISO-8859-1"; + } + else + { + /* ASCII with the ESC character (or the sequence "~{") is still + * ASCII until proven otherwise. */ + mDetectedCharset = "ASCII"; + } + default: + break; + } + } + if (mDetectedCharset) { mDone = PR_TRUE;
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/tools/CMakeLists.txt -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/tools/CMakeLists.txt
Changed
@@ -3,6 +3,18 @@ uchardet.cpp ) +include(CheckSymbolExists) + +check_symbol_exists(getopt_long "getopt.h" HAVE_GETOPT_LONG) + +# On Windows with MSVC, `getopt_long` is not available by default. +# But some third-party libraries can be used. For example, in `vcpkg`, +# we can find a port named `getopt-win32`. +if (NOT HAVE_GETOPT_LONG) + find_path(GETOPT_INCLUDE_DIR NAMES getopt.h) + find_library(GETOPT_LIBRARY NAMES getopt) +endif (NOT HAVE_GETOPT_LONG) + set(UCHARDET_BINARY uchardet) add_executable( @@ -10,6 +22,11 @@ ${UCHARDET_SOURCES} ) +if (GETOPT_INCLUDE_DIR AND GETOPT_LIBRARY) + target_include_directories(${UCHARDET_BINARY} PRIVATE ${GETOPT_INCLUDE_DIR}) + target_link_libraries(${UCHARDET_BINARY} PRIVATE ${GETOPT_LIBRARY}) +endif (GETOPT_INCLUDE_DIR AND GETOPT_LIBRARY) + target_link_libraries( ${UCHARDET_BINARY} ${UCHARDET_LIBRARY} @@ -18,6 +35,8 @@ install( TARGETS ${UCHARDET_BINARY} + EXPORT + UchardetTargets RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} )
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/tools/uchardet.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/tools/uchardet.cpp
Changed
@@ -35,27 +35,27 @@ * * ***** END LICENSE BLOCK ***** */ #include "../uchardet.h" -#include <cstdio> -#include <cstring> -#include <cstdlib> #include <getopt.h> -#include <iostream> #include <stdio.h> +#include <stdlib.h> +#include <string.h> #ifndef VERSION #define VERSION "Unknown" #endif #define BUFFER_SIZE 65536 -char buffer[BUFFER_SIZE]; +static char buffer[BUFFER_SIZE]; -void detect(FILE * fp) +static void detect(FILE * fp) { uchardet_t handle = uchardet_new(); - while (!feof(fp)) + while (1) { size_t len = fread(buffer, 1, BUFFER_SIZE, fp); + if (len == 0) + break; int retval = uchardet_handle_data(handle, buffer, len); if (retval != 0) { @@ -74,18 +74,18 @@ uchardet_delete(handle); } -void show_version() +static void show_version() { printf("\n"); printf("uchardet Command Line Tool\n"); printf("Version %s\n", VERSION); printf("\n"); printf("Authors: %s\n", "BYVoid, Jehan"); - printf("Bug Report: %s\n", "https://bugs.freedesktop.org/enter_bug.cgi?product=uchardet"); + printf("Bug Report: %s\n", "https://gitlab.freedesktop.org/uchardet/uchardet/-/issues"); printf("\n"); } -void show_usage() +static void show_usage() { show_version(); printf("Usage:\n"); @@ -105,6 +105,7 @@ { "help", no_argument, NULL, 'h' }, { 0, 0, 0, 0 }, }; + bool end_options = false; static int oc; while((oc = getopt_long(argc, argv, "vh", longopts, NULL)) != -1) @@ -125,7 +126,8 @@ FILE * f = stdin; int error_seen = 0; - if (argc < 2) + if (argc < 2 || + (argc == 2 && strcmp(argv[1], "--") == 0)) { // No file arg, use stdin by default detect(f); @@ -133,6 +135,13 @@ for (int i = 1; i < argc; i++) { const char *filename = argv[i]; + + if (! end_options && strcmp(filename, "--") == 0) + { + end_options = true; + continue; + } + f = fopen(filename, "r"); if (f == NULL) {
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/uchardet.cpp -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/uchardet.cpp
Changed
@@ -91,7 +91,11 @@ int uchardet_handle_data(uchardet_t ud, const char * data, size_t len) { - nsresult ret = reinterpret_cast<HandleUniversalDetector*>(ud)->HandleData(data, (PRUint32)len); + nsresult ret = NS_OK; + + if (len > 0) + ret = reinterpret_cast<HandleUniversalDetector*>(ud)->HandleData(data, (PRUint32)len); + return (ret != NS_OK); }
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/src/uchardet.h -> _service:tar_scm:uchardet-0.0.8.tar.xz/src/uchardet.h
Changed
@@ -20,6 +20,7 @@ * * Contributor(s): * BYVoid <byvoid.kcp@gmail.com> + * Jehan <jehan at girinstud.io> * * Alternatively, the contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or @@ -43,47 +44,65 @@ #include <stddef.h> +#if defined(UCHARDET_SHARED) && (defined(_WIN32) || defined(__CYGWIN__)) +#ifdef BUILDING_UCHARDET +#define UCHARDET_INTERFACE __declspec(dllexport) +#else +#define UCHARDET_INTERFACE __declspec(dllimport) +#endif +#else +#define UCHARDET_INTERFACE +#endif + + +/** + * A handle for a uchardet encoding detector. + */ typedef struct uchardet * uchardet_t; /** * Create an encoding detector. - * @return a handle of a instance of uchardet + * @return an instance of uchardet_t. */ -uchardet_t uchardet_new(void); +UCHARDET_INTERFACE uchardet_t uchardet_new(void); /** * Delete an encoding detector. - * @param ud [in] handle of a instance of uchardet + * @param ud [in] the uchardet_t handle to delete. */ -void uchardet_delete(uchardet_t ud); +UCHARDET_INTERFACE void uchardet_delete(uchardet_t ud); /** * Feed data to an encoding detector. - * @param ud [in] handle of a instance of uchardet + * The detector is able to shortcut processing when it reaches certainty + * for an encoding, so you should not worry about limiting input data. + * As far as you should be concerned: the more the better. + * + * @param ud [in] handle of an instance of uchardet * @param data [in] data * @param len [in] number of byte of data * @return non-zero number on failure. */ -int uchardet_handle_data(uchardet_t ud, const char * data, size_t len); +UCHARDET_INTERFACE int uchardet_handle_data(uchardet_t ud, const char * data, size_t len); /** - * Notify an end of data to an encoding detctor. - * @param ud [in] handle of a instance of uchardet + * Notify an end of data to an encoding detector.
+ * @param ud [in] handle of an instance of uchardet */ -void uchardet_data_end(uchardet_t ud); +UCHARDET_INTERFACE void uchardet_data_end(uchardet_t ud); /** * Reset an encoding detector. - * @param ud [in] handle of a instance of uchardet + * @param ud [in] handle of an instance of uchardet */ -void uchardet_reset(uchardet_t ud); +UCHARDET_INTERFACE void uchardet_reset(uchardet_t ud); /** * Get an iconv-compatible name of the encoding that was detected. - * @param ud [in] handle of a instance of uchardet + * @param ud [in] handle of an instance of uchardet * @return name of charset on success and "" on failure. */ -const char * uchardet_get_charset(uchardet_t ud); +UCHARDET_INTERFACE const char * uchardet_get_charset(uchardet_t ud); #ifdef __cplusplus }
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/cs
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/cs/ibm852.txt
Added
@@ -0,0 +1,4 @@ +Ledå ek ý¡n¡ (Alcedo atthis) je pr mØrnØ 16,5 cm velkì pt k z eledi +ledå kovitìch (Alcedinidae). Je velmi vìraznØ zbarvenì s oran§ovou spodinou a +modrìm hýbetem, ký¡dly a temenem. Vìraznìm znakem je tak jeho n padnØ dlouhì +zaçpiatØlì zob k. Pro sv kr sn zbarven¡ je nazìv n Ltaj¡c¡ drahokam.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/cs/iso-8859-2.txt
Added
@@ -0,0 +1,4 @@ +Ledòáèek øíèní (Alcedo atthis) je prùmìrnì 16,5 cm velký pták z èeledi +ledòáèkovitých (Alcedinidae). Je velmi výraznì zbarvený s oran¾ovou spodinou a +modrým høbetem, køídly a temenem. Výrazným znakem je také jeho nápadnì dlouhý +za¹pièatìlý zobák. Pro své krásné zbarvení je nazýván Létající drahokam.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/cs/mac-centraleurope.txt
Added
@@ -0,0 +1,4 @@ +LedËek Þn (Alcedo atthis) je prómrn 16,5 cm velkù ptk z eledi +ledËkovitùch (Alcedinidae). Je velmi vùrazn zbarvenù s oranìovou spodinou a +modrùm hÞbetem, kÞdly a temenem. Vùraznùm znakem je tak jeho npadn dlouhù +zaäpiatlù zobk. Pro sv krsn zbarven je nazùvn Ltajc drahokam.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/cs/utf-8.txt
Added
@@ -0,0 +1,4 @@ +Ledňáček říční (Alcedo atthis) je průměrně 16,5 cm velký pták z čeledi +ledňáčkovitých (Alcedinidae). Je velmi výrazně zbarvený s oranžovou spodinou a +modrým hřbetem, křídly a temenem. Výrazným znakem je také jeho nápadně dlouhý +zašpičatělý zobák. Pro své krásné zbarvení je nazýván Létající drahokam.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/cs/windows-1250.txt
Added
@@ -0,0 +1,4 @@ +Ledòáèek øíèní (Alcedo atthis) je prùmìrnì 16,5 cm velký pták z èeledi +ledòáèkovitých (Alcedinidae). Je velmi výraznì zbarvený s oranovou spodinou a +modrým høbetem, køídly a temenem. Výrazným znakem je také jeho nápadnì dlouhý +zapièatìlý zobák. Pro své krásné zbarvení je nazýván Létající drahokam.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/da/ibm865.txt
Added
@@ -0,0 +1,5 @@ +Jimi Hendrix (1942-1970) var en amerikansk rockguitarist, sanger og sangskriver. + +Han begyndte at spille guitar, da han var femten r, og efter at have spillet med blandt andet Little Richard dannede han Jimi Hendrix Experience i slutningen af 1966. Denne gruppe fik snart hits med sange som "Hey Joe" og "Purple Haze", og med det tredje album, Electric Ladyland fra 1968, fik gruppen sit store gennembrud. Med flere markante optrdener p tidens store festivaler, heriblandt Woodstock, opnede han legendarisk status i rockmusikken, allerede mens han var i live. + +Hendrix brugte sin elektriske guitar som elektronisk lydkilde og eksperimenterede med feedback og distortion med udgangspunkt i traditionel rock'n'roll og blues. Hans misbrug af alkohol og narkotika frte imidlertid til, at han delagde sig selv, og han dde som blot 27-rig efter indtagelse af sovepiller.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/et
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/et/iso-8859-13.txt
Added
@@ -0,0 +1,6 @@ +¥Anton Pavlovitð Tðehhov´ oli vene näite- ja novellikirjanik ning praktiseeriv arst. + +Tðehhov on eelkõige tuntud oma novellide poolest. Tema jutustuste tavaliseks +tegevuspaigaks olid vene väikeasulad ja need käsitlesid hingeüksildust, raisatud +õnne jms. Tuntud on ka tema psühholoogilised näidendid, kus valitseb kurb ja +lootusetu meeleolu.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/et/iso-8859-15.txt
Added
@@ -0,0 +1,6 @@ +Anton Pavlovit¨ T¨ehhov oli vene näite- ja novellikirjanik ning praktiseeriv arst. + +T¨ehhov on eelkõige tuntud oma novellide poolest. Tema jutustuste tavaliseks +tegevuspaigaks olid vene väikeasulad ja need käsitlesid hingeüksildust, raisatud +õnne jms. Tuntud on ka tema psühholoogilised näidendid, kus valitseb kurb ja +lootusetu meeleolu.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/et/iso-8859-4.txt
Added
@@ -0,0 +1,6 @@ +Anton Pavlovit¹ T¹ehhov oli vene näite- ja novellikirjanik ning praktiseeriv arst. + +T¹ehhov on eelkõige tuntud oma novellide poolest. Tema jutustuste tavaliseks +tegevuspaigaks olid vene väikeasulad ja need käsitlesid hingeüksildust, raisatud +õnne jms. Tuntud on ka tema psühholoogilised näidendid, kus valitseb kurb ja +lootusetu meeleolu.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/et/utf-8.txt
Added
@@ -0,0 +1,6 @@ +Anton Pavlovitš Tšehhov oli vene näite- ja novellikirjanik ning praktiseeriv arst. + +Tšehhov on eelkõige tuntud oma novellide poolest. Tema jutustuste tavaliseks +tegevuspaigaks olid vene väikeasulad ja need käsitlesid hingeüksildust, raisatud +õnne jms. Tuntud on ka tema psühholoogilised näidendid, kus valitseb kurb ja +lootusetu meeleolu.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/et/windows-1252.txt
Added
@@ -0,0 +1,6 @@ +Anton Pavlovit Tehhov oli vene näite- ja novellikirjanik ning praktiseeriv arst. + +Tehhov on eelkõige tuntud oma novellide poolest. Tema jutustuste tavaliseks +tegevuspaigaks olid vene väikeasulad ja need käsitlesid hingeüksildust, raisatud +õnne jms. Tuntud on ka tema psühholoogilised näidendid, kus valitseb kurb ja +lootusetu meeleolu.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/et/windows-1257.txt
Added
@@ -0,0 +1,6 @@ +Anton Pavlovitð Tðehhov oli vene näite- ja novellikirjanik ning praktiseeriv arst. + +Tðehhov on eelkõige tuntud oma novellide poolest. Tema jutustuste tavaliseks +tegevuspaigaks olid vene väikeasulad ja need käsitlesid hingeüksildust, raisatud +õnne jms. Tuntud on ka tema psühholoogilised näidendid, kus valitseb kurb ja +lootusetu meeleolu.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/fi
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/fi/iso-8859-1.txt
Added
@@ -0,0 +1,8 @@ +Termi science fiction on amerikkalaisen tieteislehtien toimittajan Hugo +Gernsbackin keksimä. Suomessa termin tieteiskirjallisuus loi tohtori Eino +Kauppinen 1950-luvun alkupuolella. +Tieteiskirjallisuudelle on laadittu erilaisia määritelmiä. Tieteiskirjallisuuden +rajat eivät ole yksiselitteisen selkeät. Tieteiskirjallisuus lähenee monia +kirjallisuudenlajeja, erityisesti kauhu- ja fantasiakirjallisuutta. Näillä +kolmella lajilla onkin yhteiset juuret 1800-lukua edeltävässä ei-realistisessa +kirjallisuudessa.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/fi/utf-8.txt
Added
@@ -0,0 +1,8 @@ +Termi science fiction on amerikkalaisen tieteislehtien toimittajan Hugo +Gernsbackin keksimä. Suomessa termin tieteiskirjallisuus loi tohtori Eino +Kauppinen 1950-luvun alkupuolella. +Tieteiskirjallisuudelle on laadittu erilaisia määritelmiä. Tieteiskirjallisuuden +rajat eivät ole yksiselitteisen selkeät. Tieteiskirjallisuus lähenee monia +kirjallisuudenlajeja, erityisesti kauhu- ja fantasiakirjallisuutta. Näillä +kolmella lajilla onkin yhteiset juuret 1800-lukua edeltävässä ei-realistisessa +kirjallisuudessa.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/ga
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/ga/iso-8859-1.txt
Added
@@ -0,0 +1,6 @@ +Ag seo téarmaí seoltóireachta a bhaineann le longa adhmaid agus le báid. + +Ní bhíodh de cheangal idir Éire agus tíortha eile ach na longa, agus tá Éire +féin lán de lochanna agus d'aibhneacha. Fágann seo go bhfuil an teanga breac le +téarmaíocht seoltóireachta agus loingseoireachta agus cuid di tugtha isteach ón +Lochlainnis agus ón mBéarla trí lonnaitheoirí ón iasacht.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/ga/utf-8.txt
Added
@@ -0,0 +1,6 @@ +Ag seo téarmaí seoltóireachta a bhaineann le longa adhmaid agus le báid. + +Ní bhíodh de cheangal idir Éire agus tíortha eile ach na longa, agus tá Éire +féin lán de lochanna agus d’aibhneacha. Fágann seo go bhfuil an teanga breac le +téarmaíocht seoltóireachta agus loingseoireachta agus cuid di tugtha isteach ón +Lochlainnis agus ón mBéarla trí lonnaitheoirí ón iasacht.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/ga/windows-1252.txt
Added
@@ -0,0 +1,6 @@ +Ag seo téarmaí seoltóireachta a bhaineann le longa adhmaid agus le báid. + +Ní bhíodh de cheangal idir Éire agus tíortha eile ach na longa, agus tá Éire +féin lán de lochanna agus daibhneacha. Fágann seo go bhfuil an teanga breac le +téarmaíocht seoltóireachta agus loingseoireachta agus cuid di tugtha isteach ón +Lochlainnis agus ón mBéarla trí lonnaitheoirí ón iasacht.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/hr
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/hr/ibm852.txt
Added
@@ -0,0 +1,4 @@ +Brekinja (lat. Sorbus torminalis) je bjelogorina vrsta drvea iz porodice +Rosaceae. +Prirodno je rasprostranjena u zapadnoj, srednjoj i ju§noj Europi, sjevernoj +Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/hr/iso-8859-13.txt
Added
@@ -0,0 +1,4 @@ +Brekinja (lat. Sorbus torminalis) je bjelogorièna vrsta drveãa iz porodice +Rosaceae. +Prirodno je rasprostranjena u zapadnoj, srednjoj i juþnoj Europi, sjevernoj +Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/hr/iso-8859-16.txt
Added
@@ -0,0 +1,4 @@ +Brekinja (lat. Sorbus torminalis) je bjelogori¹na vrsta drveåa iz porodice +Rosaceae. +Prirodno je rasprostranjena u zapadnoj, srednjoj i ju¸noj Europi, sjevernoj +Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/hr/iso-8859-2.txt
Added
@@ -0,0 +1,4 @@ +Brekinja (lat. Sorbus torminalis) je bjelogorièna vrsta drveæa iz porodice +Rosaceae. +Prirodno je rasprostranjena u zapadnoj, srednjoj i ju¾noj Europi, sjevernoj +Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/hr/mac-centraleurope.txt
Added
@@ -0,0 +1,4 @@ +Brekinja (lat. Sorbus torminalis) je bjelogorina vrsta drvea iz porodice +Rosaceae. +Prirodno je rasprostranjena u zapadnoj, srednjoj i juìnoj Europi, sjevernoj +Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/hr/utf-8.txt
Added
@@ -0,0 +1,4 @@ +Brekinja (lat. Sorbus torminalis) je bjelogorična vrsta drveća iz porodice +Rosaceae. +Prirodno je rasprostranjena u zapadnoj, srednjoj i južnoj Europi, sjevernoj +Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/hr/windows-1250.txt
Added
@@ -0,0 +1,4 @@ +Brekinja (lat. Sorbus torminalis) je bjelogorièna vrsta drveæa iz porodice +Rosaceae. +Prirodno je rasprostranjena u zapadnoj, srednjoj i junoj Europi, sjevernoj +Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/it
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/it/iso-8859-1.txt
Added
@@ -0,0 +1,18 @@ +L'architettura longobarda è costituita dall'insieme delle opere architettoniche +realizzate in Italia durante il regno dei Longobardi (568-774), con residuale +permanenza nell'Italia meridionale fino al X-XI secolo (Langobardia Minor), e +commissionate dai re e dai duchi longobardi. +L'attività architettonica sviluppata in Langobardia Maior è andata in gran parte +perduta, per lo più a causa di successive ricostruzioni degli edifici sacri e +profani eretti tra VII e VIII secolo. A parte il Tempietto longobardo di +Cividale del Friuli, rimasto in gran parte intatto, gli edifici civili e +religiosi di Pavia, Monza o altre località sono stati ampiamente rimaneggiati +nei secoli seguenti. Ancora integre rimangono così soltanto poche architetture, +o perché inglobate negli ampliamenti successivi - come la chiesa di San +Salvatore a Brescia) -, o perché periferiche e di modeste dimensioni - come la +chiesa di Santa Maria foris portas a Castelseprio. Testimonianze maggiormente +fedeli alla forma originale si ritrovano, invece, nella Langobardia Minor: a +Benevento si conservano la chiesa di Santa Sofia, un ampio tratto delle Mura e +la Rocca dei Rettori, unici esempi superstiti di architettura militare +longobarda, mentre altre testimonianze si sono conservate in centri minori del +ducato beneventano e a Spoleto.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/it/utf-8.txt
Added
@@ -0,0 +1,18 @@ +L'architettura longobarda è costituita dall'insieme delle opere architettoniche +realizzate in Italia durante il regno dei Longobardi (568-774), con residuale +permanenza nell'Italia meridionale fino al X-XI secolo (Langobardia Minor), e +commissionate dai re e dai duchi longobardi. +L'attività architettonica sviluppata in Langobardia Maior è andata in gran parte +perduta, per lo più a causa di successive ricostruzioni degli edifici sacri e +profani eretti tra VII e VIII secolo. A parte il Tempietto longobardo di +Cividale del Friuli, rimasto in gran parte intatto, gli edifici civili e +religiosi di Pavia, Monza o altre località sono stati ampiamente rimaneggiati +nei secoli seguenti. Ancora integre rimangono così soltanto poche architetture, +o perché inglobate negli ampliamenti successivi - come la chiesa di San +Salvatore a Brescia) -, o perché periferiche e di modeste dimensioni - come la +chiesa di Santa Maria foris portas a Castelseprio. Testimonianze maggiormente +fedeli alla forma originale si ritrovano, invece, nella Langobardia Minor: a +Benevento si conservano la chiesa di Santa Sofia, un ampio tratto delle Mura e +la Rocca dei Rettori, unici esempi superstiti di architettura militare +longobarda, mentre altre testimonianze si sono conservate in centri minori del +ducato beneventano e a Spoleto.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/ko/uhc.smi
Changed
(renamed from test/ko/euc-kr.smi)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/lt
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/lt/iso-8859-10.txt
Added
@@ -0,0 +1,3 @@ +Vincentas van Gogas (ol. Vincent van Gogh, 1853 m. kovo 30 d. Grot Zunderte, +Nyderlandai - 1890 m. liepos 29 d. Overe prie Uazos, Pranc¾zija) - olandù +tapytojas ir grafikas, postimpresionistas.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/lt/iso-8859-13.txt
Added
@@ -0,0 +1,3 @@ +Vincentas van Gogas (ol. Vincent van Gogh, 1853 m. kovo 30 d. Grot Zunderte, +Nyderlandai - 1890 m. liepos 29 d. Overe prie Uazos, Prancûzija) - olandø +tapytojas ir grafikas, postimpresionistas.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/lt/iso-8859-4.txt
Added
@@ -0,0 +1,3 @@ +Vincentas van Gogas (ol. Vincent van Gogh, 1853 m. kovo 30 d. Grot Zunderte, +Nyderlandai - 1890 m. liepos 29 d. Overe prie Uazos, Prancþzija) - olandù +tapytojas ir grafikas, postimpresionistas.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/lt/utf-8.txt
Added
@@ -0,0 +1,3 @@ +Vincentas van Gogas (ol. Vincent van Gogh, 1853 m. kovo 30 d. Grot Zunderte, +Nyderlandai – 1890 m. liepos 29 d. Overe prie Uazos, Prancūzija) – olandų +tapytojas ir grafikas, postimpresionistas.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/lv
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/lv/iso-8859-10.txt
Added
@@ -0,0 +1,6 @@ +Vinsents Villems van Gogs (n´derlandieºu: Vincent Willem van Gogh, dzimis 1853. +gada 30. martà, miris 1890. gada 29. j¾lijà) bija n´derlandieºu gleznotàjs, +postimpresionisma pàrstàvis. Kopumà van Gogs rad´ja vairàk nekà 2000 darbu, to +skaità 900 gleznu un 1100 z´m²jumu un skièu. Savus slavenàkos darbus viñº rad´ja +p²d²jo divu dz´ves gadu laikà. Tiek uzskat´ts, ka van Gogs b¾tiski ir ietekm²jis +20. gadsimta màkslu, tostarp ekspresionismu un fovismu.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/lv/iso-8859-13.txt
Added
@@ -0,0 +1,6 @@ +Vinsents Villems van Gogs (nîderlandieðu: Vincent Willem van Gogh, dzimis 1853. +gada 30. martâ, miris 1890. gada 29. jûlijâ) bija nîderlandieðu gleznotâjs, +postimpresionisma pârstâvis. Kopumâ van Gogs radîja vairâk nekâ 2000 darbu, to +skaitâ 900 gleznu un 1100 zîmçjumu un skièu. Savus slavenâkos darbus viòð radîja +pçdçjo divu dzîves gadu laikâ. Tiek uzskatîts, ka van Gogs bûtiski ir ietekmçjis +20. gadsimta mâkslu, tostarp ekspresionismu un fovismu.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/lv/iso-8859-4.txt
Added
@@ -0,0 +1,6 @@ +Vinsents Villems van Gogs (nïderlandie¹u: Vincent Willem van Gogh, dzimis 1853. +gada 30. martà, miris 1890. gada 29. jþlijà) bija nïderlandie¹u gleznotàjs, +postimpresionisma pàrstàvis. Kopumà van Gogs radïja vairàk nekà 2000 darbu, to +skaità 900 gleznu un 1100 zïmºjumu un skièu. Savus slavenàkos darbus viñ¹ radïja +pºdºjo divu dzïves gadu laikà. Tiek uzskatïts, ka van Gogs bþtiski ir ietekmºjis +20. gadsimta màkslu, tostarp ekspresionismu un fovismu.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/lv/utf-8.txt
Added
@@ -0,0 +1,6 @@ +Vinsents Villems van Gogs (nīderlandiešu: Vincent Willem van Gogh, dzimis 1853. +gada 30. martā, miris 1890. gada 29. jūlijā) bija nīderlandiešu gleznotājs, +postimpresionisma pārstāvis. Kopumā van Gogs radīja vairāk nekā 2000 darbu, to +skaitā 900 gleznu un 1100 zīmējumu un skiču. Savus slavenākos darbus viņš radīja +pēdējo divu dzīves gadu laikā. Tiek uzskatīts, ka van Gogs būtiski ir ietekmējis +20. gadsimta mākslu, tostarp ekspresionismu un fovismu.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/mt
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/mt/iso-8859-3.txt
Added
@@ -0,0 +1,1 @@ +Il-Malti huwa l-ilsien nazzjonali tar-Repubblika ta' Malta. Huwa l-ilsien uffiåjali flimkien mal-Ingli¿; kif ukoll wie±ed mill-ilsna uffiåjali tal-Unjoni Ewropea. Dan l-ilsien g±andu sisien u g±erq semitiku, ta' djalett G±arbi li õej mit-Tramuntana tal-Afrika, g±al±ekk qatt ma kellu rabta mill-qrib mal-G±arbi Klassiku. I¿da tul i¿-¿minijiet, min±abba proåess tal-Latinizzazzjoni ta' Malta, bdew de±lin bosta elementi lingwistiåi mill-Isqalli, djalett ta' art li wkoll g±addiet minn ¿mien ta' ±akma G±arbija. Wara l-Isqalli beda die±el ukoll it-Taljan, fuq kollox fi¿-¿mien tad-da±la tal-Kavallieri tal-Ordni ta' San Õwann sa meta l-Ingli¿ ±a post it-Taljan b±ala l-ilsien uffiåjali fil-Kostituzzjoni Kolonjali tal-1934. Il-Malti huwa l-ilsien wa±dieni ta' g±ajn semitika li jinkiteb b'ittri Latini. L-alfabett Malti mag±mul minn 30 ittra (24 konsonanti u 6 vokali) li jidhru f'din l-ordni:
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/mt/utf-8.txt
Added
@@ -0,0 +1,4 @@ +Franza (Franċiż:France), uffiċjalment ir-Repubblika Franċiża (Franċiż: +République française), hi pajjiż fl-Ewropa tal-Punent. Il-belt belt kapitali +tagħha hi Pariġi. Hi membru tal-Unjoni Ewropea. Franza hi maqsuma f'22 régions +li huma suddiviżi f' départements.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/no
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/no/ibm865.txt
Added
@@ -0,0 +1,17 @@ +Pangramer brukes som ren underholdning; som skriveeksempel for prve p +hndskrift; som hjelpemiddel til vise en font; eller som huskeregel for +raskt teste tegnsettet i teknisk utstyr som behandler eller viser bokstaver. + + +Sr golfer med klle vant sexquiz p wc i hjemby. +Hvdingens kjre squaw fr litt pizza i Mexico by. +Vr kjre my i cape vde banjo, whist og quiz i taxifila. +IQ-ls WC-boms uten hrsel skjrer god pizza p xylofon. +Vr kjre zulu-my vde banjo, whist og quickstep fra taxi. +Etter quiz og whist m Jo bre fakkellys p vr srgende cox. +Taxisjfren quizet bedre om calypso, watt og klr p hjemveien. +Vr sre Zulu fra badeya spilte jo whist og quickstep i min taxi. +Du t ca fire wienerplser og tok taxi hjem fra byen med re fra quizen. +Jeg begynte fortre en sandwich mens jeg kjrte taxi p vei til quiz. +Quisling var ein klppar til spela jazz p xylofon, men lrte seg aldri spela cembalo fr han drog til Washington. +Hvdingens kjre squaw fr litt pizza i Mexico by.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/no/iso-8859-1.txt
Added
@@ -0,0 +1,20 @@ +Pangramer brukes som ren underholdning; som skriveeksempel for prøve på +håndskrift; som hjelpemiddel til å vise en font; eller som huskeregel for å +raskt teste tegnsettet i teknisk utstyr som behandler eller viser bokstaver. + + +Sær golfer med kølle vant sexquiz på wc i hjemby. +Høvdingens kjære squaw får litt pizza i Mexico by. +Vår kjære møy i cape øvde banjo, whist og quiz i taxifila. +IQ-løs WC-boms uten hørsel skjærer god pizza på xylofon. +Vår kjære zulu-møy øvde banjo, whist og quickstep fra taxi. +Etter quiz og whist må Jo bære fakkellys på vår sørgående cox. +Taxisjåføren quizet bedre om calypso, watt og klær på hjemveien. +Vår sære Zulu fra badeøya spilte jo whist og quickstep i min taxi. +Du åt ca fire wienerpølser og tok taxi hjem fra byen med ære fra quizen. +Jeg begynte å fortære en sandwich mens jeg kjørte taxi på vei til quiz. +Quisling var ein kløppar til å spela jazz på xylofon, men lærte seg aldri å spela cembalo før han drog til Washington. +Høvdingens kjære squaw får litt pizza i Mexico by. + +Et sted, cirka ¾ inn i John Greens siste roman, Skilpadder hele veien ned, +begynte jeg og romanens forteller, Aza Holmes, å gråte helt samtidig
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/no/iso-8859-15.txt
Added
@@ -0,0 +1,21 @@ +Pangramer brukes som ren underholdning; som skriveeksempel for prøve på +håndskrift; som hjelpemiddel til å vise en font; eller som huskeregel for å +raskt teste tegnsettet i teknisk utstyr som behandler eller viser bokstaver. + + +Sær golfer med kølle vant sexquiz på wc i hjemby. +Høvdingens kjære squaw får litt pizza i Mexico by. +Vår kjære møy i cape øvde banjo, whist og quiz i taxifila. +IQ-løs WC-boms uten hørsel skjærer god pizza på xylofon. +Vår kjære zulu-møy øvde banjo, whist og quickstep fra taxi. +Etter quiz og whist må Jo bære fakkellys på vår sørgående cox. +Taxisjåføren quizet bedre om calypso, watt og klær på hjemveien. +Vår sære Zulu fra badeøya spilte jo whist og quickstep i min taxi. +Du åt ca fire wienerpølser og tok taxi hjem fra byen med ære fra quizen. +Jeg begynte å fortære en sandwich mens jeg kjørte taxi på vei til quiz. +Quisling var ein kløppar til å spela jazz på xylofon, men lærte seg aldri å spela cembalo før han drog til Washington. +Høvdingens kjære squaw får litt pizza i Mexico by. + +Euro (symbol: ¤) er den Den europeiske unions myntenhet. Den +er innført i 19 av unionens 27 medlemsland (kjent som eurosonen) og i fire +mikrostater og noen andre land og områder.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/no/utf-8.txt
Added
@@ -0,0 +1,20 @@ +Pangramer brukes som ren underholdning; som skriveeksempel for prøve på +håndskrift; som hjelpemiddel til å vise en font; eller som huskeregel for å +raskt teste tegnsettet i teknisk utstyr som behandler eller viser bokstaver. + + +Sær golfer med kølle vant sexquiz på wc i hjemby. +Høvdingens kjære squaw får litt pizza i Mexico by. +Vår kjære møy i cape øvde banjo, whist og quiz i taxifila. +IQ-løs WC-boms uten hørsel skjærer god pizza på xylofon. +Vår kjære zulu-møy øvde banjo, whist og quickstep fra taxi. +Etter quiz og whist må Jo bære fakkellys på vår sørgående cox. +Taxisjåføren quizet bedre om calypso, watt og klær på hjemveien. +Vår sære Zulu fra badeøya spilte jo whist og quickstep i min taxi. +Du åt ca fire wienerpølser og tok taxi hjem fra byen med ære fra quizen. +Jeg begynte å fortære en sandwich mens jeg kjørte taxi på vei til quiz. +Quisling var ein kløppar til å spela jazz på xylofon, men lærte seg aldri å spela cembalo før han drog til Washington. +Høvdingens kjære squaw får litt pizza i Mexico by. + +Et sted, cirka ¾ inn i John Greens siste roman, Skilpadder hele veien ned, +begynte jeg og romanens forteller, Aza Holmes, å gråte helt samtidig
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/no/windows-1252.txt
Added
@@ -0,0 +1,21 @@ +Pangramer brukes som ren underholdning; som skriveeksempel for prøve på +håndskrift; som hjelpemiddel til å vise en font; eller som huskeregel for å +raskt teste tegnsettet i teknisk utstyr som behandler eller viser bokstaver. + + +Sær golfer med kølle vant sexquiz på wc i hjemby. +Høvdingens kjære squaw får litt pizza i Mexico by. +Vår kjære møy i cape øvde banjo, whist og quiz i taxifila. +IQ-løs WC-boms uten hørsel skjærer god pizza på xylofon. +Vår kjære zulu-møy øvde banjo, whist og quickstep fra taxi. +Etter quiz og whist må Jo bære fakkellys på vår sørgående cox. +Taxisjåføren quizet bedre om calypso, watt og klær på hjemveien. +Vår sære Zulu fra badeøya spilte jo whist og quickstep i min taxi. +Du åt ca fire wienerpølser og tok taxi hjem fra byen med ære fra quizen. +Jeg begynte å fortære en sandwich mens jeg kjørte taxi på vei til quiz. +Quisling var ein kløppar til å spela jazz på xylofon, men lærte seg aldri å spela cembalo før han drog til Washington. +Høvdingens kjære squaw får litt pizza i Mexico by. + +Euro (symbol: valutakode: EUR) er den Den europeiske unions myntenhet. Den +er innført i 19 av unionens 27 medlemsland (kjent som eurosonen) og i fire +mikrostater og noen andre land og områder.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/pl
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/pl/ibm852.txt
Added
@@ -0,0 +1,3 @@ +Zofia (Sonka) Holszaäska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrzenia 1461 w Krakowie) +ksi©¾niczka litewska, kr¢lowa Polski, od 1422 roku czwarta i ostatnia ¾ona Wadysawa II +Jagiey.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/pl/iso-8859-13.txt
Added
@@ -0,0 +1,3 @@ +Zofia (Sonka) Holszañska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrzeúnia 1461 w Krakowie) +ksiæýniczka litewska, królowa Polski, od 1422 roku czwarta i ostatnia ýona Wùadysùawa II +Jagieùùy.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/pl/iso-8859-16.txt
Added
@@ -0,0 +1,3 @@ +Zofia (Sonka) Holszañska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrze÷nia 1461 w Krakowie) +ksiý¿niczka litewska, królowa Polski, od 1422 roku czwarta i ostatnia ¿ona W³adys³awa II +Jagie³³y.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/pl/iso-8859-2.txt
Added
@@ -0,0 +1,3 @@ +Zofia (Sonka) Holszañska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrze¶nia 1461 w Krakowie) +ksiê¿niczka litewska, królowa Polski, od 1422 roku czwarta i ostatnia ¿ona W³adys³awa II +Jagie³³y.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/pl/mac-centraleurope.txt
Added
@@ -0,0 +1,3 @@ +Zofia (Sonka) HolszaÄska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrzeænia 1461 w Krakowie) +ksi«ýniczka litewska, krlowa Polski, od 1422 roku czwarta i ostatnia ýona W¸adys¸awa II +Jagie¸¸y.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/pl/utf-8.txt
Added
@@ -0,0 +1,3 @@ +Zofia (Sonka) Holszańska herbu Hippocentaurus (ur. ok. 1405, zm. 21 września 1461 w Krakowie) +księżniczka litewska, królowa Polski, od 1422 roku czwarta i ostatnia żona Władysława II +Jagiełły.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/pl/windows-1250.txt
Added
@@ -0,0 +1,3 @@ +Zofia (Sonka) Holszañska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrzenia 1461 w Krakowie) +ksiê¿niczka litewska, królowa Polski, od 1422 roku czwarta i ostatnia ¿ona W³adys³awa II +Jagie³³y.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/pt
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/pt/iso-8859-1.txt
Added
@@ -0,0 +1,6 @@ +Albertossauro (Albertosaurus sp., que significa "lagarto de Alberta" no Canadá), +foi um género de dinossauro carnívoro e bípede presente no fim do período +Cretáceo. Media cerca de 8 a 9 metros de comprimento, 3 metros de altura e +pesava menos de 2 toneladas. O Albertossauro viveu na América do Norte e foi +descoberto no ano de 1884 por Joseph Burr Tyrrell em Alberta, no Canadá, local +ao qual deve seu nome.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/pt/utf-8.txt
Added
@@ -0,0 +1,6 @@ +Albertossauro (Albertosaurus sp., que significa "lagarto de Alberta" no Canadá), +foi um género de dinossauro carnívoro e bípede presente no fim do período +Cretáceo. Media cerca de 8 a 9 metros de comprimento, 3 metros de altura e +pesava menos de 2 toneladas. O Albertossauro viveu na América do Norte e foi +descoberto no ano de 1884 por Joseph Burr Tyrrell em Alberta, no Canadá, local +ao qual deve seu nome.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/ro
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/ro/ibm852.txt
Added
@@ -0,0 +1,9 @@ +Danemarca (n danezÇ Sunet Danmark), oficial Regatul Danemarcei (n +danezÇ Sunet Kongeriget Danmark), este un stat suveran din +Europa de Nord, avnd si douÇ tÇri constituente de peste mÇri, care fac parte +integrantÇ din regat: Insulele Feroe n Atlanticul de Nord si Groenlanda n +America de Nord. Danemarca propriu-zisÇa este cea mai de sud dintre tÇrile +nordice, aflatÇ la sud-vest de Suedia si la sud de Norvegia, nvecinndu-se la +sud cu Germania. Tara constÇ dintr-o peninsulÇ mare, Iutlanda, si mai multe +insule, dintre care cele mai mari sunt Zealand, Funen, Lolland, Falster si +Bornholm, precum si sute de insulite denumite n general ,,Arhipelagul Danez".
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/ro/iso-8859-16.txt
Added
@@ -0,0 +1,9 @@ +Danemarca (în danezã Sunet Danmark), oficial Regatul Danemarcei (în +danezã Sunet Kongeriget Danmark), este un stat suveran din +Europa de Nord, având ºi douã þãri constituente de peste mãri, care fac parte +integrantã din regat: Insulele Feroe în Atlanticul de Nord ºi Groenlanda în +America de Nord. Danemarca propriu-zisãa este cea mai de sud dintre þãrile +nordice, aflatã la sud-vest de Suedia ºi la sud de Norvegia, învecinându-se la +sud cu Germania. Þara constã dintr-o peninsulã mare, Iutlanda, ºi mai multe +insule, dintre care cele mai mari sunt Zealand, Funen, Lolland, Falster ºi +Bornholm, precum ºi sute de insuliþe denumite în general ¥Arhipelagul Danezµ.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/ro/utf-8.txt
Added
@@ -0,0 +1,9 @@ +Danemarca (în daneză Sunet Danmark), oficial Regatul Danemarcei (în +daneză Sunet Kongeriget Danmark), este un stat suveran din +Europa de Nord, având și două țări constituente de peste mări, care fac parte +integrantă din regat: Insulele Feroe în Atlanticul de Nord și Groenlanda în +America de Nord. Danemarca propriu-zisăa este cea mai de sud dintre țările +nordice, aflată la sud-vest de Suedia și la sud de Norvegia, învecinându-se la +sud cu Germania. Țara constă dintr-o peninsulă mare, Iutlanda, și mai multe +insule, dintre care cele mai mari sunt Zealand, Funen, Lolland, Falster și +Bornholm, precum și sute de insulițe denumite în general „Arhipelagul Danez”.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/ro/windows-1250.txt
Added
@@ -0,0 +1,9 @@ +Danemarca (în danezã Sunet Danmark), oficial Regatul Danemarcei (în +danezã Sunet Kongeriget Danmark), este un stat suveran din +Europa de Nord, având si douã tãri constituente de peste mãri, care fac parte +integrantã din regat: Insulele Feroe în Atlanticul de Nord si Groenlanda în +America de Nord. Danemarca propriu-zisãa este cea mai de sud dintre tãrile +nordice, aflatã la sud-vest de Suedia si la sud de Norvegia, învecinându-se la +sud cu Germania. Tara constã dintr-o peninsulã mare, Iutlanda, si mai multe +insule, dintre care cele mai mari sunt Zealand, Funen, Lolland, Falster si +Bornholm, precum si sute de insulite denumite în general Arhipelagul Danez.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sk
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sk/ibm852.txt
Added
@@ -0,0 +1,3 @@ +Jupiter je piata planta v porad¡ od Slnka, najvçia a najhmotnejçia planta +naçej slnenej s£stavy. Je pomenovanì po r¡mskom bohovi Jupiterovi. Symbolom +planty je çtylizovan zn zornenie Jupiterovho bo§skho blesku.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sk/iso-8859-2.txt
Added
@@ -0,0 +1,3 @@ +Jupiter je piata planéta v poradí od Slnka, najväè¹ia a najhmotnej¹ia planéta +na¹ej slneènej sústavy. Je pomenovaný po rímskom bohovi Jupiterovi. Symbolom +planéty je ¹tylizované znázornenie Jupiterovho bo¾ského blesku.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sk/mac-centraleurope.txt
Added
@@ -0,0 +1,3 @@ +Jupiter je piata planta v porad od Slnka, najväia a najhmotnejäia planta +naäej slnenej sstavy. Je pomenovanù po rmskom bohovi Jupiterovi. Symbolom +planty je ätylizovan znzornenie Jupiterovho boìskho blesku.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sk/utf-8.txt
Added
@@ -0,0 +1,3 @@ +Jupiter je piata planéta v poradí od Slnka, najväčšia a najhmotnejšia planéta +našej slnečnej sústavy. Je pomenovaný po rímskom bohovi Jupiterovi. Symbolom +planéty je štylizované znázornenie Jupiterovho božského blesku.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sk/windows-1250.txt
Added
@@ -0,0 +1,3 @@ +Jupiter je piata planéta v poradí od Slnka, najväèia a najhmotnejia planéta +naej slneènej sústavy. Je pomenovaný po rímskom bohovi Jupiterovi. Symbolom +planéty je tylizované znázornenie Jupiterovho boského blesku.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sl
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sl/ibm852.txt
Added
@@ -0,0 +1,9 @@ +Naselj¡vi plant je planet ali naravni satelit (redkeje tudi asteroid1), ki je +zmo§en razviti in ohranjati §ivljenje. + +Ker je obstoj nezemeljskega §ivljenja trenutno negotov, je raziskovanje +naseljivih planetov v glavnem ekstrapolacija razmer na Zemlji in znailnosti +Sonca in celotnega Osonja, ki govorijo v prid razvitju §ivljenja. æe posebej so +pomembni faktorji, ki so ohranili zapletene, mnogoceline organizme in ne le +preprosta, enocelina §iva bitja, mikroorganizme. Raziskovanje in teorija v tej +smeri je del planetologije in razvijajoe astrobiologije.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sl/iso-8859-16.txt
Added
@@ -0,0 +1,9 @@ +Naseljívi planét je planet ali naravni satelit (redkeje tudi asteroid1), ki je +zmo¸en razviti in ohranjati ¸ivljenje. + +Ker je obstoj nezemeljskega ¸ivljenja trenutno negotov, je raziskovanje +naseljivih planetov v glavnem ekstrapolacija razmer na Zemlji in zna¹ilnosti +Sonca in celotnega Oson¹ja, ki govorijo v prid razvitju ¸ivljenja. ¦e posebej so +pomembni faktorji, ki so ohranili zapletene, mnogoceli¹ne organizme in ne le +preprosta, enoceli¹na ¸iva bitja, mikroorganizme. Raziskovanje in teorija v tej +smeri je del planetologije in razvijajo¹e astrobiologije.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sl/iso-8859-2.txt
Added
@@ -0,0 +1,9 @@ +Naseljívi planét je planet ali naravni satelit (redkeje tudi asteroid1), ki je +zmo¾en razviti in ohranjati ¾ivljenje. + +Ker je obstoj nezemeljskega ¾ivljenja trenutno negotov, je raziskovanje +naseljivih planetov v glavnem ekstrapolacija razmer na Zemlji in znaèilnosti +Sonca in celotnega Osonèja, ki govorijo v prid razvitju ¾ivljenja. ©e posebej so +pomembni faktorji, ki so ohranili zapletene, mnogoceliène organizme in ne le +preprosta, enocelièna ¾iva bitja, mikroorganizme. Raziskovanje in teorija v tej +smeri je del planetologije in razvijajoèe astrobiologije.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sl/mac-centraleurope.txt
Added
@@ -0,0 +1,9 @@ +Naseljvi plant je planet ali naravni satelit (redkeje tudi asteroid1), ki je +zmoìen razviti in ohranjati ìivljenje. + +Ker je obstoj nezemeljskega ìivljenja trenutno negotov, je raziskovanje +naseljivih planetov v glavnem ekstrapolacija razmer na Zemlji in znailnosti +Sonca in celotnega Osonja, ki govorijo v prid razvitju ìivljenja. áe posebej so +pomembni faktorji, ki so ohranili zapletene, mnogoceline organizme in ne le +preprosta, enocelina ìiva bitja, mikroorganizme. Raziskovanje in teorija v tej +smeri je del planetologije in razvijajoe astrobiologije.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sl/utf-8.txt
Added
@@ -0,0 +1,9 @@ +Naseljívi planét je planet ali naravni satelit (redkeje tudi asteroid1), ki je +zmožen razviti in ohranjati življenje. + +Ker je obstoj nezemeljskega življenja trenutno negotov, je raziskovanje +naseljivih planetov v glavnem ekstrapolacija razmer na Zemlji in značilnosti +Sonca in celotnega Osončja, ki govorijo v prid razvitju življenja. Še posebej so +pomembni faktorji, ki so ohranili zapletene, mnogocelične organizme in ne le +preprosta, enocelična živa bitja, mikroorganizme. Raziskovanje in teorija v tej +smeri je del planetologije in razvijajoče astrobiologije.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sl/windows-1250.txt
Added
@@ -0,0 +1,9 @@ +Naseljívi planét je planet ali naravni satelit (redkeje tudi asteroid1), ki je +zmoen razviti in ohranjati ivljenje. + +Ker je obstoj nezemeljskega ivljenja trenutno negotov, je raziskovanje +naseljivih planetov v glavnem ekstrapolacija razmer na Zemlji in znaèilnosti +Sonca in celotnega Osonèja, ki govorijo v prid razvitju ivljenja. e posebej so +pomembni faktorji, ki so ohranili zapletene, mnogoceliène organizme in ne le +preprosta, enocelièna iva bitja, mikroorganizme. Raziskovanje in teorija v tej +smeri je del planetologije in razvijajoèe astrobiologije.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sv
Added
+(directory)
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sv/iso-8859-1.txt
Added
@@ -0,0 +1,10 @@ +Mölle är en tätort på Kullahalvön i Brunnby socken i Höganäs kommun, Skåne län. + +Samhället var från början ett fiskeläge, men kom att spela en stor roll i den +framväxande turismen i Sverige i slutet av 1800-talet. Till detta bidrog - och +bidrar - Mölles natursköna läge invid Öresunds norra utlopp, med Kullaberg som +bakgrund. Gemensamhetsbad för män och kvinnor introducerades i Ransvik i början +av 1900-talet. Storhetstiden som turistort inträffade strax före första +världskriget, men även under mellankrigstiden var turistströmmarna stora. +Fortfarande är Mölle en populär turistort med en tredubbling av invånarantalet +under sommarmånaderna.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sv/utf-8.txt
Added
@@ -0,0 +1,10 @@ +Mölle är en tätort på Kullahalvön i Brunnby socken i Höganäs kommun, Skåne län. + +Samhället var från början ett fiskeläge, men kom att spela en stor roll i den +framväxande turismen i Sverige i slutet av 1800-talet. Till detta bidrog – och +bidrar – Mölles natursköna läge invid Öresunds norra utlopp, med Kullaberg som +bakgrund. Gemensamhetsbad för män och kvinnor introducerades i Ransvik i början +av 1900-talet. Storhetstiden som turistort inträffade strax före första +världskriget, men även under mellankrigstiden var turistströmmarna stora. +Fortfarande är Mölle en populär turistort med en tredubbling av invånarantalet +under sommarmånaderna.
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/test/sv/windows-1252.txt
Added
@@ -0,0 +1,10 @@ +Mölle är en tätort på Kullahalvön i Brunnby socken i Höganäs kommun, Skåne län. + +Samhället var från början ett fiskeläge, men kom att spela en stor roll i den +framväxande turismen i Sverige i slutet av 1800-talet. Till detta bidrog och +bidrar Mölles natursköna läge invid Öresunds norra utlopp, med Kullaberg som +bakgrund. Gemensamhetsbad för män och kvinnor introducerades i Ransvik i början +av 1900-talet. Storhetstiden som turistort inträffade strax före första +världskriget, men även under mellankrigstiden var turistströmmarna stora. +Fortfarande är Mölle en populär turistort med en tredubbling av invånarantalet +under sommarmånaderna.
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/test/uchardet-tests.c -> _service:tar_scm:uchardet-0.0.8.tar.xz/test/uchardet-tests.c
Changed
@@ -52,9 +52,11 @@ char bufferBUFFER_SIZE; int i; - while (!feof(fp)) + while (1) { size_t len = fread(buffer, 1, BUFFER_SIZE, fp); + if (len == 0) + break; int retval = uchardet_handle_data(handle, buffer, len); if (retval != 0) { @@ -100,7 +102,9 @@ { /* Error opening the test file. */ fprintf(stderr, - "uchardet-tests: error opening the test file\n"); + "uchardet-tests: error opening the test file \"%s\"\n", + filename); + free(filename); return 1; } @@ -120,6 +124,9 @@ /* In a unit test, 0 means success, other returned values mean failure. */ success = (strcmp(charset, expected_charset) != 0); + if (success) { + fprintf(stderr, "Got %s, expected %s\n", charset, expected_charset); + } free(charset); free(filename);
View file
_service:tar_scm:uchardet-0.0.8.tar.xz/uchardet-config.cmake.in
Added
@@ -0,0 +1,19 @@ +# This file may optionally do: +# +# 1. Check for dependencies of exported targets. Example: +# +# include(CMakeFindDependencyMacro) +# find_dependency(MYDEP REQUIRED) +# +# find_dependency() has the same syntax as find_package() +# +# 2. Capture values from configuration. Example: +# +# set(my-config-var @my-config-var@) +# +# 3. Other required setup when importing targets from another project +# +# See also: +# https://cliutils.gitlab.io/modern-cmake/chapters/install.html +# +include("${CMAKE_CURRENT_LIST_DIR}/uchardet-targets.cmake")
View file
_service:tar_scm:uchardet-0.0.6.tar.xz/uchardet.doap -> _service:tar_scm:uchardet-0.0.8.tar.xz/uchardet.doap
Changed
@@ -33,7 +33,7 @@ <download-page rdf:resource="https://www.freedesktop.org/software/uchardet/releases/" /> - <bug-database rdf:resource="https://bugs.freedesktop.org/enter_bug.cgi?product=uchardet" /> + <bug-database rdf:resource="https://gitlab.freedesktop.org/uchardet/uchardet/-/issues" /> <programming-language>C</programming-language> <programming-language>C++</programming-language>
Locations
Projects
Search
Status Monitor
Help
Open Build Service
OBS Manuals
API Documentation
OBS Portal
Reporting a Bug
Contact
Mailing List
Forums
Chat (IRC)
Twitter
Open Build Service (OBS)
is an
openSUSE project
.
浙ICP备2022010568号-2