# Source code for latexcodec.codec

# -*- coding: utf-8 -*-
"""
    LaTeX Codec
    ~~~~~~~~~~~

    The :mod:`latexcodec.codec` module
    contains all classes and functions for LaTeX code
    translation. For practical use,
    you should only ever need to import the :mod:`latexcodec` module,
    which will automatically register the codec
    so it can be used by :meth:`str.encode`, :meth:`str.decode`,
    and any of the functions defined in the :mod:`codecs` module
    such as :func:`codecs.open` and so on.
    The other functions and classes
    are exposed in case someone would want to extend them.

    .. autofunction:: register

    .. autofunction:: find_latex

    .. autoclass:: LatexIncrementalEncoder
        :show-inheritance:
        :members:

    .. autoclass:: LatexIncrementalDecoder
        :show-inheritance:
        :members:

    .. autoclass:: LatexCodec
        :show-inheritance:
        :members:

    .. autoclass:: LatexUnicodeTable
        :members:
"""

# Copyright (c) 2003, 2008 David Eppstein
# Copyright (c) 2011-2014 Matthias C. M. Troffaes
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.

from __future__ import print_function

import codecs
from six import string_types
from six.moves import range

from latexcodec import lexer


def register():
    """Install :func:`find_latex` as a codec search function.

    The function is handed to :func:`codecs.register`, so that later
    codec lookups consult :func:`find_latex`.

    .. seealso:: :func:`codecs.register`
    """
    codecs.register(find_latex)
# returns the codec search function
# this is used if latex_codec.py were to be placed in stdlib
def getregentry():
    """Encodings module API.

    :return: The result of ``find_latex('latex')``.
    """
    return find_latex('latex')
class LatexUnicodeTable:
    """Tabulates a translation between LaTeX and unicode.

    The table is filled by :meth:`register_all` when the instance is
    created. It exposes three attributes:

    * ``unicode_map``: maps a tuple of lexer tokens to the unicode text
      it decodes to.
    * ``latex_map``: maps a single unicode character to a
      ``(latex_text, tokens)`` pair used for encoding.
    * ``max_length``: the largest number of tokens of any key in
      ``unicode_map``.

    :param lexer: Lexer instance used to tokenize the LaTeX side of
        every registration. This class reads its ``binary_mode``,
        ``curlylefttoken`` and ``curlyrighttoken`` attributes, sets its
        ``state``, and calls its ``reset`` and ``get_tokens`` methods.
    """

    def __init__(self, lexer):
        self.lexer = lexer
        self.unicode_map = {}
        self.max_length = 0
        self.latex_map = {}
        self.register_all()

    def register_all(self):
        """Register all symbols and their LaTeX equivalents
        (called by constructor).

        The order of the calls matters: :meth:`register` keeps the
        *first* LaTeX form registered for a character (used for
        encoding) and the *first* character registered for a token
        sequence (used for decoding).
        """
        # TODO complete this list
        # register special symbols
        self.register(u'\n\n', b' \\par', encode=False)
        self.register(u'\n\n', b'\\par', encode=False)
        self.register(u' ', b'\\ ', encode=False)
        self.register(u'\N{EM SPACE}', b'\\quad')
        self.register(u'%', b'\\%')
        self.register(u'\N{EN DASH}', b'--')
        self.register(u'\N{EN DASH}', b'\\textendash')
        self.register(u'\N{EM DASH}', b'---')
        self.register(u'\N{EM DASH}', b'\\textemdash')
        self.register(u'\N{LEFT SINGLE QUOTATION MARK}', b'`', decode=False)
        self.register(u'\N{RIGHT SINGLE QUOTATION MARK}', b"'", decode=False)
        self.register(u'\N{LEFT DOUBLE QUOTATION MARK}', b'``')
        self.register(u'\N{RIGHT DOUBLE QUOTATION MARK}', b"''")
        self.register(u'\N{DOUBLE LOW-9 QUOTATION MARK}', b'\\glqq')
        self.register(u'\N{DAGGER}', b'\\dag')
        self.register(u'\N{DOUBLE DAGGER}', b'\\ddag')
        self.register(u'\\', b'\\textbackslash', encode=False)
        self.register(u'\\', b'\\backslash', mode='math', encode=False)
        self.register(u'\N{TILDE OPERATOR}', b'\\sim', mode='math')
        self.register(u'\N{MODIFIER LETTER LOW TILDE}',
                      b'\\texttildelow', package='textcomp')
        self.register(u'\N{SMALL TILDE}', b'\\~{}')
        self.register(u'~', b'\\textasciitilde')
        self.register(u'\N{BULLET}', b'\\bullet', mode='math')
        self.register(u'\N{BULLET}', b'\\textbullet', package='textcomp')
        self.register(u'\N{NUMBER SIGN}', b'\\#')
        self.register(u'\N{LOW LINE}', b'\\_')
        self.register(u'\N{AMPERSAND}', b'\\&')
        self.register(u'\N{NO-BREAK SPACE}', b'~')
        self.register(u'\N{INVERTED EXCLAMATION MARK}', b'!`')
        self.register(u'\N{CENT SIGN}', b'\\not{c}')
        self.register(u'\N{POUND SIGN}', b'\\pounds')
        self.register(u'\N{POUND SIGN}', b'\\textsterling', package='textcomp')
        self.register(u'\N{SECTION SIGN}', b'\\S')
        self.register(u'\N{DIAERESIS}', b'\\"{}')
        self.register(u'\N{NOT SIGN}', b'\\neg')
        self.register(u'\N{HYPHEN}', b'-', decode=False)
        self.register(u'\N{SOFT HYPHEN}', b'\\-')
        self.register(u'\N{MACRON}', b'\\={}')
        self.register(u'\N{DEGREE SIGN}', b'^\\circ', mode='math')
        self.register(u'\N{DEGREE SIGN}', b'\\textdegree', package='textcomp')
        self.register(u'\N{MINUS SIGN}', b'-', mode='math')
        self.register(u'\N{PLUS-MINUS SIGN}', b'\\pm', mode='math')
        self.register(u'\N{PLUS-MINUS SIGN}', b'\\textpm', package='textcomp')
        self.register(u'\N{SUPERSCRIPT TWO}', b'^2', mode='math')
        self.register(
            u'\N{SUPERSCRIPT TWO}', b'\\texttwosuperior', package='textcomp')
        self.register(u'\N{SUPERSCRIPT THREE}', b'^3', mode='math')
        self.register(
            u'\N{SUPERSCRIPT THREE}', b'\\textthreesuperior',
            package='textcomp')
        self.register(u'\N{ACUTE ACCENT}', b"\\'{}")
        self.register(u'\N{MICRO SIGN}', b'\\mu', mode='math')
        self.register(u'\N{MICRO SIGN}', b'\\micro', package='gensymb')
        self.register(u'\N{PILCROW SIGN}', b'\\P')
        self.register(u'\N{MIDDLE DOT}', b'\\cdot', mode='math')
        self.register(
            u'\N{MIDDLE DOT}', b'\\textperiodcentered', package='textcomp')
        self.register(u'\N{CEDILLA}', b'\\c{}')
        self.register(u'\N{SUPERSCRIPT ONE}', b'^1', mode='math')
        self.register(
            u'\N{SUPERSCRIPT ONE}', b'\\textonesuperior', package='textcomp')
        self.register(u'\N{INVERTED QUESTION MARK}', b'?`')
        self.register(u'\N{LATIN CAPITAL LETTER A WITH GRAVE}', b'\\`A')
        self.register(u'\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}', b'\\^A')
        self.register(u'\N{LATIN CAPITAL LETTER A WITH TILDE}', b'\\~A')
        self.register(u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}', b'\\"A')
        self.register(u'\N{LATIN CAPITAL LETTER A WITH RING ABOVE}', b'\\AA')
        self.register(u'\N{LATIN CAPITAL LETTER A WITH RING ABOVE}',
                      b'\\r A', encode=False)
        self.register(u'\N{LATIN CAPITAL LETTER AE}', b'\\AE')
        self.register(u'\N{LATIN CAPITAL LETTER C WITH CEDILLA}', b'\\c C')
        self.register(u'\N{LATIN CAPITAL LETTER E WITH GRAVE}', b'\\`E')
        self.register(u'\N{LATIN CAPITAL LETTER E WITH ACUTE}', b"\\'E")
        self.register(u'\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}', b'\\^E')
        self.register(u'\N{LATIN CAPITAL LETTER E WITH DIAERESIS}', b'\\"E')
        self.register(u'\N{LATIN CAPITAL LETTER I WITH GRAVE}', b'\\`I')
        self.register(u'\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}', b'\\^I')
        self.register(u'\N{LATIN CAPITAL LETTER I WITH DIAERESIS}', b'\\"I')
        self.register(u'\N{LATIN CAPITAL LETTER N WITH TILDE}', b'\\~N')
        self.register(u'\N{LATIN CAPITAL LETTER O WITH GRAVE}', b'\\`O')
        self.register(u'\N{LATIN CAPITAL LETTER O WITH ACUTE}', b"\\'O")
        self.register(u'\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}', b'\\^O')
        self.register(u'\N{LATIN CAPITAL LETTER O WITH TILDE}', b'\\~O')
        self.register(u'\N{LATIN CAPITAL LETTER O WITH DIAERESIS}', b'\\"O')
        self.register(u'\N{MULTIPLICATION SIGN}', b'\\times', mode='math')
        self.register(u'\N{LATIN CAPITAL LETTER O WITH STROKE}', b'\\O')
        self.register(u'\N{LATIN CAPITAL LETTER U WITH GRAVE}', b'\\`U')
        self.register(u'\N{LATIN CAPITAL LETTER U WITH ACUTE}', b"\\'U")
        self.register(u'\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}', b'\\^U')
        self.register(u'\N{LATIN CAPITAL LETTER U WITH DIAERESIS}', b'\\"U')
        self.register(u'\N{LATIN CAPITAL LETTER Y WITH ACUTE}', b"\\'Y")
        self.register(u'\N{LATIN SMALL LETTER SHARP S}', b'\\ss')
        self.register(u'\N{LATIN SMALL LETTER A WITH GRAVE}', b'\\`a')
        self.register(u'\N{LATIN SMALL LETTER A WITH ACUTE}', b"\\'a")
        self.register(u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}', b'\\^a')
        self.register(u'\N{LATIN SMALL LETTER A WITH TILDE}', b'\\~a')
        self.register(u'\N{LATIN SMALL LETTER A WITH DIAERESIS}', b'\\"a')
        self.register(u'\N{LATIN SMALL LETTER A WITH RING ABOVE}', b'\\aa')
        self.register(u'\N{LATIN SMALL LETTER A WITH RING ABOVE}',
                      b'\\r a', encode=False)
        self.register(u'\N{LATIN SMALL LETTER AE}', b'\\ae')
        self.register(u'\N{LATIN SMALL LETTER C WITH CEDILLA}', b'\\c c')
        self.register(u'\N{LATIN SMALL LETTER E WITH GRAVE}', b'\\`e')
        self.register(u'\N{LATIN SMALL LETTER E WITH ACUTE}', b"\\'e")
        self.register(u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}', b'\\^e')
        self.register(u'\N{LATIN SMALL LETTER E WITH DIAERESIS}', b'\\"e')
        self.register(u'\N{LATIN SMALL LETTER I WITH GRAVE}', b'\\`\\i')
        self.register(u'\N{LATIN SMALL LETTER I WITH GRAVE}', b'\\`i')
        self.register(u'\N{LATIN SMALL LETTER I WITH ACUTE}', b"\\'\\i")
        self.register(u'\N{LATIN SMALL LETTER I WITH ACUTE}', b"\\'i")
        self.register(u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}', b'\\^\\i')
        self.register(u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}', b'\\^i')
        self.register(u'\N{LATIN SMALL LETTER I WITH DIAERESIS}', b'\\"\\i')
        self.register(u'\N{LATIN SMALL LETTER I WITH DIAERESIS}', b'\\"i')
        self.register(u'\N{LATIN SMALL LETTER N WITH TILDE}', b'\\~n')
        self.register(u'\N{LATIN SMALL LETTER O WITH GRAVE}', b'\\`o')
        self.register(u'\N{LATIN SMALL LETTER O WITH ACUTE}', b"\\'o")
        self.register(u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}', b'\\^o')
        self.register(u'\N{LATIN SMALL LETTER O WITH TILDE}', b'\\~o')
        self.register(u'\N{LATIN SMALL LETTER O WITH DIAERESIS}', b'\\"o')
        self.register(u'\N{DIVISION SIGN}', b'\\div', mode='math')
        self.register(u'\N{LATIN SMALL LETTER O WITH STROKE}', b'\\o')
        self.register(u'\N{LATIN SMALL LETTER U WITH GRAVE}', b'\\`u')
        self.register(u'\N{LATIN SMALL LETTER U WITH ACUTE}', b"\\'u")
        self.register(u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}', b'\\^u')
        self.register(u'\N{LATIN SMALL LETTER U WITH DIAERESIS}', b'\\"u')
        self.register(u'\N{LATIN SMALL LETTER Y WITH ACUTE}', b"\\'y")
        self.register(u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}', b'\\"y')
        self.register(u'\N{LATIN CAPITAL LETTER A WITH MACRON}', b'\\=A')
        self.register(u'\N{LATIN SMALL LETTER A WITH MACRON}', b'\\=a')
        self.register(u'\N{LATIN CAPITAL LETTER A WITH BREVE}', b'\\u A')
        self.register(u'\N{LATIN SMALL LETTER A WITH BREVE}', b'\\u a')
        self.register(u'\N{LATIN CAPITAL LETTER A WITH OGONEK}', b'\\k A')
        self.register(u'\N{LATIN SMALL LETTER A WITH OGONEK}', b'\\k a')
        self.register(u'\N{LATIN CAPITAL LETTER C WITH ACUTE}', b"\\'C")
        self.register(u'\N{LATIN SMALL LETTER C WITH ACUTE}', b"\\'c")
        self.register(u'\N{LATIN CAPITAL LETTER C WITH CIRCUMFLEX}', b'\\^C')
        self.register(u'\N{LATIN SMALL LETTER C WITH CIRCUMFLEX}', b'\\^c')
        self.register(u'\N{LATIN CAPITAL LETTER C WITH DOT ABOVE}', b'\\.C')
        self.register(u'\N{LATIN SMALL LETTER C WITH DOT ABOVE}', b'\\.c')
        self.register(u'\N{LATIN CAPITAL LETTER C WITH CARON}', b'\\v C')
        self.register(u'\N{LATIN SMALL LETTER C WITH CARON}', b'\\v c')
        self.register(u'\N{LATIN CAPITAL LETTER D WITH CARON}', b'\\v D')
        self.register(u'\N{LATIN SMALL LETTER D WITH CARON}', b'\\v d')
        self.register(u'\N{LATIN CAPITAL LETTER E WITH MACRON}', b'\\=E')
        self.register(u'\N{LATIN SMALL LETTER E WITH MACRON}', b'\\=e')
        self.register(u'\N{LATIN CAPITAL LETTER E WITH BREVE}', b'\\u E')
        self.register(u'\N{LATIN SMALL LETTER E WITH BREVE}', b'\\u e')
        self.register(u'\N{LATIN CAPITAL LETTER E WITH DOT ABOVE}', b'\\.E')
        self.register(u'\N{LATIN SMALL LETTER E WITH DOT ABOVE}', b'\\.e')
        self.register(u'\N{LATIN CAPITAL LETTER E WITH OGONEK}', b'\\k E')
        self.register(u'\N{LATIN SMALL LETTER E WITH OGONEK}', b'\\k e')
        self.register(u'\N{LATIN CAPITAL LETTER E WITH CARON}', b'\\v E')
        self.register(u'\N{LATIN SMALL LETTER E WITH CARON}', b'\\v e')
        self.register(u'\N{LATIN CAPITAL LETTER G WITH CIRCUMFLEX}', b'\\^G')
        self.register(u'\N{LATIN SMALL LETTER G WITH CIRCUMFLEX}', b'\\^g')
        self.register(u'\N{LATIN CAPITAL LETTER G WITH BREVE}', b'\\u G')
        self.register(u'\N{LATIN SMALL LETTER G WITH BREVE}', b'\\u g')
        self.register(u'\N{LATIN CAPITAL LETTER G WITH DOT ABOVE}', b'\\.G')
        self.register(u'\N{LATIN SMALL LETTER G WITH DOT ABOVE}', b'\\.g')
        self.register(u'\N{LATIN CAPITAL LETTER G WITH CEDILLA}', b'\\c G')
        self.register(u'\N{LATIN SMALL LETTER G WITH CEDILLA}', b'\\c g')
        self.register(u'\N{LATIN CAPITAL LETTER H WITH CIRCUMFLEX}', b'\\^H')
        self.register(u'\N{LATIN SMALL LETTER H WITH CIRCUMFLEX}', b'\\^h')
        self.register(u'\N{LATIN CAPITAL LETTER I WITH TILDE}', b'\\~I')
        self.register(u'\N{LATIN SMALL LETTER I WITH TILDE}', b'\\~\\i')
        self.register(u'\N{LATIN SMALL LETTER I WITH TILDE}', b'\\~i')
        self.register(u'\N{LATIN CAPITAL LETTER I WITH MACRON}', b'\\=I')
        self.register(u'\N{LATIN SMALL LETTER I WITH MACRON}', b'\\=\\i')
        self.register(u'\N{LATIN SMALL LETTER I WITH MACRON}', b'\\=i')
        self.register(u'\N{LATIN CAPITAL LETTER I WITH BREVE}', b'\\u I')
        self.register(u'\N{LATIN SMALL LETTER I WITH BREVE}', b'\\u\\i')
        self.register(u'\N{LATIN SMALL LETTER I WITH BREVE}', b'\\u i')
        self.register(u'\N{LATIN CAPITAL LETTER I WITH OGONEK}', b'\\k I')
        self.register(u'\N{LATIN SMALL LETTER I WITH OGONEK}', b'\\k i')
        self.register(u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}', b'\\.I')
        self.register(u'\N{LATIN SMALL LETTER DOTLESS I}', b'\\i')
        self.register(u'\N{LATIN CAPITAL LIGATURE IJ}', b'IJ', decode=False)
        self.register(u'\N{LATIN SMALL LIGATURE IJ}', b'ij', decode=False)
        self.register(u'\N{LATIN CAPITAL LETTER J WITH CIRCUMFLEX}', b'\\^J')
        self.register(u'\N{LATIN SMALL LETTER J WITH CIRCUMFLEX}', b'\\^\\j')
        self.register(u'\N{LATIN SMALL LETTER J WITH CIRCUMFLEX}', b'\\^j')
        self.register(u'\N{LATIN CAPITAL LETTER K WITH CEDILLA}', b'\\c K')
        self.register(u'\N{LATIN SMALL LETTER K WITH CEDILLA}', b'\\c k')
        self.register(u'\N{LATIN CAPITAL LETTER L WITH ACUTE}', b"\\'L")
        self.register(u'\N{LATIN SMALL LETTER L WITH ACUTE}', b"\\'l")
        self.register(u'\N{LATIN CAPITAL LETTER L WITH CEDILLA}', b'\\c L')
        self.register(u'\N{LATIN SMALL LETTER L WITH CEDILLA}', b'\\c l')
        self.register(u'\N{LATIN CAPITAL LETTER L WITH CARON}', b'\\v L')
        self.register(u'\N{LATIN SMALL LETTER L WITH CARON}', b'\\v l')
        self.register(u'\N{LATIN CAPITAL LETTER L WITH STROKE}', b'\\L')
        self.register(u'\N{LATIN SMALL LETTER L WITH STROKE}', b'\\l')
        self.register(u'\N{LATIN CAPITAL LETTER N WITH ACUTE}', b"\\'N")
        self.register(u'\N{LATIN SMALL LETTER N WITH ACUTE}', b"\\'n")
        self.register(u'\N{LATIN CAPITAL LETTER N WITH CEDILLA}', b'\\c N')
        self.register(u'\N{LATIN SMALL LETTER N WITH CEDILLA}', b'\\c n')
        self.register(u'\N{LATIN CAPITAL LETTER N WITH CARON}', b'\\v N')
        self.register(u'\N{LATIN SMALL LETTER N WITH CARON}', b'\\v n')
        self.register(u'\N{LATIN CAPITAL LETTER O WITH MACRON}', b'\\=O')
        self.register(u'\N{LATIN SMALL LETTER O WITH MACRON}', b'\\=o')
        self.register(u'\N{LATIN CAPITAL LETTER O WITH BREVE}', b'\\u O')
        self.register(u'\N{LATIN SMALL LETTER O WITH BREVE}', b'\\u o')
        self.register(
            u'\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}', b'\\H O')
        self.register(u'\N{LATIN SMALL LETTER O WITH DOUBLE ACUTE}', b'\\H o')
        self.register(u'\N{LATIN CAPITAL LIGATURE OE}', b'\\OE')
        self.register(u'\N{LATIN SMALL LIGATURE OE}', b'\\oe')
        self.register(u'\N{LATIN CAPITAL LETTER R WITH ACUTE}', b"\\'R")
        self.register(u'\N{LATIN SMALL LETTER R WITH ACUTE}', b"\\'r")
        self.register(u'\N{LATIN CAPITAL LETTER R WITH CEDILLA}', b'\\c R')
        self.register(u'\N{LATIN SMALL LETTER R WITH CEDILLA}', b'\\c r')
        self.register(u'\N{LATIN CAPITAL LETTER R WITH CARON}', b'\\v R')
        self.register(u'\N{LATIN SMALL LETTER R WITH CARON}', b'\\v r')
        self.register(u'\N{LATIN CAPITAL LETTER S WITH ACUTE}', b"\\'S")
        self.register(u'\N{LATIN SMALL LETTER S WITH ACUTE}', b"\\'s")
        self.register(u'\N{LATIN CAPITAL LETTER S WITH CIRCUMFLEX}', b'\\^S')
        self.register(u'\N{LATIN SMALL LETTER S WITH CIRCUMFLEX}', b'\\^s')
        self.register(u'\N{LATIN CAPITAL LETTER S WITH CEDILLA}', b'\\c S')
        self.register(u'\N{LATIN SMALL LETTER S WITH CEDILLA}', b'\\c s')
        self.register(u'\N{LATIN CAPITAL LETTER S WITH CARON}', b'\\v S')
        self.register(u'\N{LATIN SMALL LETTER S WITH CARON}', b'\\v s')
        self.register(u'\N{LATIN CAPITAL LETTER T WITH CEDILLA}', b'\\c T')
        self.register(u'\N{LATIN SMALL LETTER T WITH CEDILLA}', b'\\c t')
        self.register(u'\N{LATIN CAPITAL LETTER T WITH CARON}', b'\\v T')
        self.register(u'\N{LATIN SMALL LETTER T WITH CARON}', b'\\v t')
        self.register(u'\N{LATIN CAPITAL LETTER U WITH TILDE}', b'\\~U')
        self.register(u'\N{LATIN SMALL LETTER U WITH TILDE}', b'\\~u')
        self.register(u'\N{LATIN CAPITAL LETTER U WITH MACRON}', b'\\=U')
        self.register(u'\N{LATIN SMALL LETTER U WITH MACRON}', b'\\=u')
        self.register(u'\N{LATIN CAPITAL LETTER U WITH BREVE}', b'\\u U')
        self.register(u'\N{LATIN SMALL LETTER U WITH BREVE}', b'\\u u')
        self.register(u'\N{LATIN CAPITAL LETTER U WITH RING ABOVE}', b'\\r U')
        self.register(u'\N{LATIN SMALL LETTER U WITH RING ABOVE}', b'\\r u')
        self.register(
            u'\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}', b'\\H U')
        self.register(u'\N{LATIN SMALL LETTER U WITH DOUBLE ACUTE}', b'\\H u')
        self.register(u'\N{LATIN CAPITAL LETTER U WITH OGONEK}', b'\\k U')
        self.register(u'\N{LATIN SMALL LETTER U WITH OGONEK}', b'\\k u')
        self.register(u'\N{LATIN CAPITAL LETTER W WITH CIRCUMFLEX}', b'\\^W')
        self.register(u'\N{LATIN SMALL LETTER W WITH CIRCUMFLEX}', b'\\^w')
        self.register(u'\N{LATIN CAPITAL LETTER Y WITH CIRCUMFLEX}', b'\\^Y')
        self.register(u'\N{LATIN SMALL LETTER Y WITH CIRCUMFLEX}', b'\\^y')
        self.register(u'\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}', b'\\"Y')
        self.register(u'\N{LATIN CAPITAL LETTER Z WITH ACUTE}', b"\\'Z")
        self.register(u'\N{LATIN SMALL LETTER Z WITH ACUTE}', b"\\'z")
        self.register(u'\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}', b'\\.Z')
        self.register(u'\N{LATIN SMALL LETTER Z WITH DOT ABOVE}', b'\\.z')
        self.register(u'\N{LATIN CAPITAL LETTER Z WITH CARON}', b'\\v Z')
        self.register(u'\N{LATIN SMALL LETTER Z WITH CARON}', b'\\v z')
        self.register(u'\N{LATIN CAPITAL LETTER DZ WITH CARON}', b'D\\v Z')
        self.register(
            u'\N{LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON}',
            b'D\\v z')
        self.register(u'\N{LATIN SMALL LETTER DZ WITH CARON}', b'd\\v z')
        self.register(u'\N{LATIN CAPITAL LETTER LJ}', b'LJ', decode=False)
        self.register(
            u'\N{LATIN CAPITAL LETTER L WITH SMALL LETTER J}', b'Lj',
            decode=False)
        self.register(u'\N{LATIN SMALL LETTER LJ}', b'lj', decode=False)
        self.register(u'\N{LATIN CAPITAL LETTER NJ}', b'NJ', decode=False)
        self.register(
            u'\N{LATIN CAPITAL LETTER N WITH SMALL LETTER J}', b'Nj',
            decode=False)
        self.register(u'\N{LATIN SMALL LETTER NJ}', b'nj', decode=False)
        self.register(u'\N{LATIN CAPITAL LETTER A WITH CARON}', b'\\v A')
        self.register(u'\N{LATIN SMALL LETTER A WITH CARON}', b'\\v a')
        self.register(u'\N{LATIN CAPITAL LETTER I WITH CARON}', b'\\v I')
        self.register(u'\N{LATIN SMALL LETTER I WITH CARON}', b'\\v\\i')
        self.register(u'\N{LATIN CAPITAL LETTER O WITH CARON}', b'\\v O')
        self.register(u'\N{LATIN SMALL LETTER O WITH CARON}', b'\\v o')
        self.register(u'\N{LATIN CAPITAL LETTER U WITH CARON}', b'\\v U')
        self.register(u'\N{LATIN SMALL LETTER U WITH CARON}', b'\\v u')
        self.register(u'\N{LATIN CAPITAL LETTER G WITH CARON}', b'\\v G')
        self.register(u'\N{LATIN SMALL LETTER G WITH CARON}', b'\\v g')
        self.register(u'\N{LATIN CAPITAL LETTER K WITH CARON}', b'\\v K')
        self.register(u'\N{LATIN SMALL LETTER K WITH CARON}', b'\\v k')
        self.register(u'\N{LATIN CAPITAL LETTER O WITH OGONEK}', b'\\k O')
        self.register(u'\N{LATIN SMALL LETTER O WITH OGONEK}', b'\\k o')
        self.register(u'\N{LATIN SMALL LETTER J WITH CARON}', b'\\v\\j')
        self.register(u'\N{LATIN CAPITAL LETTER DZ}', b'DZ', decode=False)
        self.register(
            u'\N{LATIN CAPITAL LETTER D WITH SMALL LETTER Z}', b'Dz',
            decode=False)
        self.register(u'\N{LATIN SMALL LETTER DZ}', b'dz', decode=False)
        self.register(u'\N{LATIN CAPITAL LETTER G WITH ACUTE}', b"\\'G")
        self.register(u'\N{LATIN SMALL LETTER G WITH ACUTE}', b"\\'g")
        self.register(u'\N{LATIN CAPITAL LETTER AE WITH ACUTE}', b"\\'\\AE")
        self.register(u'\N{LATIN SMALL LETTER AE WITH ACUTE}', b"\\'\\ae")
        self.register(
            u'\N{LATIN CAPITAL LETTER O WITH STROKE AND ACUTE}', b"\\'\\O")
        self.register(
            u'\N{LATIN SMALL LETTER O WITH STROKE AND ACUTE}', b"\\'\\o")
        self.register(u'\N{PARTIAL DIFFERENTIAL}', b'\\partial', mode='math')
        self.register(u'\N{N-ARY PRODUCT}', b'\\prod', mode='math')
        self.register(u'\N{N-ARY SUMMATION}', b'\\sum', mode='math')
        self.register(u'\N{SQUARE ROOT}', b'\\surd', mode='math')
        self.register(u'\N{INFINITY}', b'\\infty', mode='math')
        self.register(u'\N{INTEGRAL}', b'\\int', mode='math')
        self.register(u'\N{INTERSECTION}', b'\\cap', mode='math')
        self.register(u'\N{UNION}', b'\\cup', mode='math')
        self.register(u'\N{RIGHTWARDS ARROW}', b'\\rightarrow', mode='math')
        self.register(
            u'\N{RIGHTWARDS DOUBLE ARROW}', b'\\Rightarrow', mode='math')
        self.register(u'\N{LEFTWARDS ARROW}', b'\\leftarrow', mode='math')
        self.register(
            u'\N{LEFTWARDS DOUBLE ARROW}', b'\\Leftarrow', mode='math')
        self.register(u'\N{LOGICAL OR}', b'\\vee', mode='math')
        self.register(u'\N{LOGICAL AND}', b'\\wedge', mode='math')
        self.register(u'\N{ALMOST EQUAL TO}', b'\\approx', mode='math')
        self.register(u'\N{NOT EQUAL TO}', b'\\neq', mode='math')
        self.register(u'\N{LESS-THAN OR EQUAL TO}', b'\\leq', mode='math')
        self.register(u'\N{GREATER-THAN OR EQUAL TO}', b'\\geq', mode='math')
        self.register(u'\N{MODIFIER LETTER CIRCUMFLEX ACCENT}', b'\\^{}')
        self.register(u'\N{CARON}', b'\\v{}')
        self.register(u'\N{BREVE}', b'\\u{}')
        self.register(u'\N{DOT ABOVE}', b'\\.{}')
        self.register(u'\N{RING ABOVE}', b'\\r{}')
        self.register(u'\N{OGONEK}', b'\\k{}')
        self.register(u'\N{DOUBLE ACUTE ACCENT}', b'\\H{}')
        self.register(u'\N{LATIN SMALL LIGATURE FI}', b'fi', decode=False)
        self.register(u'\N{LATIN SMALL LIGATURE FL}', b'fl', decode=False)
        self.register(u'\N{LATIN SMALL LIGATURE FF}', b'ff', decode=False)
        self.register(u'\N{GREEK SMALL LETTER ALPHA}', b'\\alpha', mode='math')
        self.register(u'\N{GREEK SMALL LETTER BETA}', b'\\beta', mode='math')
        self.register(u'\N{GREEK SMALL LETTER GAMMA}', b'\\gamma', mode='math')
        self.register(u'\N{GREEK SMALL LETTER DELTA}', b'\\delta', mode='math')
        self.register(
            u'\N{GREEK SMALL LETTER EPSILON}', b'\\epsilon', mode='math')
        self.register(u'\N{GREEK SMALL LETTER ZETA}', b'\\zeta', mode='math')
        self.register(u'\N{GREEK SMALL LETTER ETA}', b'\\eta', mode='math')
        self.register(u'\N{GREEK SMALL LETTER THETA}', b'\\theta', mode='math')
        self.register(u'\N{GREEK SMALL LETTER IOTA}', b'\\iota', mode='math')
        self.register(u'\N{GREEK SMALL LETTER KAPPA}', b'\\kappa', mode='math')
        self.register(
            u'\N{GREEK SMALL LETTER LAMDA}', b'\\lambda',
            mode='math')  # LAMDA not LAMBDA
        self.register(u'\N{GREEK SMALL LETTER MU}', b'\\mu', mode='math')
        self.register(u'\N{GREEK SMALL LETTER NU}', b'\\nu', mode='math')
        self.register(u'\N{GREEK SMALL LETTER XI}', b'\\xi', mode='math')
        self.register(
            u'\N{GREEK SMALL LETTER OMICRON}', b'\\omicron', mode='math')
        self.register(u'\N{GREEK SMALL LETTER PI}', b'\\pi', mode='math')
        self.register(u'\N{GREEK SMALL LETTER RHO}', b'\\rho', mode='math')
        self.register(u'\N{GREEK SMALL LETTER SIGMA}', b'\\sigma', mode='math')
        self.register(u'\N{GREEK SMALL LETTER TAU}', b'\\tau', mode='math')
        self.register(
            u'\N{GREEK SMALL LETTER UPSILON}', b'\\upsilon', mode='math')
        self.register(u'\N{GREEK SMALL LETTER PHI}', b'\\phi', mode='math')
        self.register(u'\N{GREEK PHI SYMBOL}', b'\\varphi', mode='math')
        self.register(u'\N{GREEK SMALL LETTER CHI}', b'\\chi', mode='math')
        self.register(u'\N{GREEK SMALL LETTER PSI}', b'\\psi', mode='math')
        self.register(u'\N{GREEK SMALL LETTER OMEGA}', b'\\omega', mode='math')
        self.register(
            u'\N{GREEK CAPITAL LETTER ALPHA}', b'\\Alpha', mode='math')
        self.register(u'\N{GREEK CAPITAL LETTER BETA}', b'\\Beta', mode='math')
        self.register(
            u'\N{GREEK CAPITAL LETTER GAMMA}', b'\\Gamma', mode='math')
        self.register(
            u'\N{GREEK CAPITAL LETTER DELTA}', b'\\Delta', mode='math')
        self.register(
            u'\N{GREEK CAPITAL LETTER EPSILON}', b'\\Epsilon', mode='math')
        self.register(u'\N{GREEK CAPITAL LETTER ZETA}', b'\\Zeta', mode='math')
        self.register(u'\N{GREEK CAPITAL LETTER ETA}', b'\\Eta', mode='math')
        self.register(
            u'\N{GREEK CAPITAL LETTER THETA}', b'\\Theta', mode='math')
        self.register(u'\N{GREEK CAPITAL LETTER IOTA}', b'\\Iota', mode='math')
        self.register(
            u'\N{GREEK CAPITAL LETTER KAPPA}', b'\\Kappa', mode='math')
        self.register(
            u'\N{GREEK CAPITAL LETTER LAMDA}', b'\\Lambda',
            mode='math')  # LAMDA not LAMBDA
        self.register(u'\N{GREEK CAPITAL LETTER MU}', b'\\Mu', mode='math')
        self.register(u'\N{GREEK CAPITAL LETTER NU}', b'\\Nu', mode='math')
        self.register(u'\N{GREEK CAPITAL LETTER XI}', b'\\Xi', mode='math')
        self.register(
            u'\N{GREEK CAPITAL LETTER OMICRON}', b'\\Omicron', mode='math')
        self.register(u'\N{GREEK CAPITAL LETTER PI}', b'\\Pi', mode='math')
        self.register(u'\N{GREEK CAPITAL LETTER RHO}', b'\\Rho', mode='math')
        self.register(
            u'\N{GREEK CAPITAL LETTER SIGMA}', b'\\Sigma', mode='math')
        self.register(u'\N{GREEK CAPITAL LETTER TAU}', b'\\Tau', mode='math')
        self.register(
            u'\N{GREEK CAPITAL LETTER UPSILON}', b'\\Upsilon', mode='math')
        self.register(u'\N{GREEK CAPITAL LETTER PHI}', b'\\Phi', mode='math')
        self.register(u'\N{GREEK CAPITAL LETTER CHI}', b'\\Chi', mode='math')
        self.register(u'\N{GREEK CAPITAL LETTER PSI}', b'\\Psi', mode='math')
        self.register(
            u'\N{GREEK CAPITAL LETTER OMEGA}', b'\\Omega', mode='math')
        self.register(u'\N{COPYRIGHT SIGN}', b'\\copyright')
        self.register(u'\N{COPYRIGHT SIGN}', b'\\textcopyright')
        self.register(u'\N{LATIN CAPITAL LETTER A WITH ACUTE}', b"\\'A")
        self.register(u'\N{LATIN CAPITAL LETTER I WITH ACUTE}', b"\\'I")
        self.register(u'\N{HORIZONTAL ELLIPSIS}', b'\\ldots')
        self.register(u'\N{TRADE MARK SIGN}', b'^{TM}', mode='math')
        self.register(
            u'\N{TRADE MARK SIGN}', b'\\texttrademark', package='textcomp')
        # \=O and \=o will be translated into Ō and ō before we can
        # match the full latex string... so decoding disabled for now
        self.register(u'Ǭ', br'\textogonekcentered{\=O}', decode=False)
        self.register(u'ǭ', br'\textogonekcentered{\=o}', decode=False)
        self.register(u'ℕ', br'\mathbb{N}', mode='math')
        self.register(u'ℕ', br'\mathbb N', mode='math', decode=False)
        self.register(u'ℤ', br'\mathbb{Z}', mode='math')
        self.register(u'ℤ', br'\mathbb Z', mode='math', decode=False)
        self.register(u'ℚ', br'\mathbb{Q}', mode='math')
        self.register(u'ℚ', br'\mathbb Q', mode='math', decode=False)
        self.register(u'ℝ', br'\mathbb{R}', mode='math')
        self.register(u'ℝ', br'\mathbb R', mode='math', decode=False)
        self.register(u'ℂ', br'\mathbb{C}', mode='math')
        self.register(u'ℂ', br'\mathbb C', mode='math', decode=False)

    def register(self, unicode_text, latex_text, mode='text', package=None,
                 decode=True, encode=True):
        """Register a correspondence between *unicode_text* and *latex_text*.

        Existing entries are never overwritten: the first registration
        for a given token sequence (decoding) or unicode character
        (encoding) is the one that is kept.

        :param str unicode_text: A unicode character.
        :param bytes latex_text: Its corresponding LaTeX translation.
        :param str mode: LaTeX mode in which the translation applies
            (``'text'`` or ``'math'``). In ``'math'`` mode, only the
            text-mode forms ``$...$`` and ``\\(...\\)`` are registered.
        :param str package: LaTeX package requirements
            (currently ignored).
        :param bool decode: Whether this translation applies to decoding
            (default: ``True``).
        :param bool encode: Whether this translation applies to encoding
            (default: ``True``).
        :raises ValueError: If an encoding entry would be created for a
            *unicode_text* that is not exactly one character long. The
            table is left unchanged in that case.
        """
        if mode == 'math':
            # also register text version
            self.register(unicode_text, b'$' + latex_text + b'$',
                          mode='text', package=package,
                          decode=decode, encode=encode)
            self.register(unicode_text, br'\(' + latex_text + br'\)',
                          mode='text', package=package,
                          decode=decode, encode=encode)
            # XXX for the time being, we do not perform in-math substitutions
            return
        # Validate before touching any state. This used to be an ``assert``
        # placed after the decode map had already been updated, so it was
        # skipped entirely under ``python -O``.
        add_encoding = encode and unicode_text not in self.latex_map
        if add_encoding and len(unicode_text) != 1:
            raise ValueError(
                "can only register a single character for encoding, "
                "got {0!r}".format(unicode_text))
        if not self.lexer.binary_mode:
            latex_text = latex_text.decode("ascii")
        # TODO implement packages (*package* is accepted but not used yet)
        # tokenize, and register unicode translation
        self.lexer.reset()
        self.lexer.state = 'M'
        tokens = tuple(self.lexer.get_tokens(latex_text, final=True))
        if decode:
            if tokens not in self.unicode_map:
                self.max_length = max(self.max_length, len(tokens))
                self.unicode_map[tokens] = unicode_text
            # also register token variant with brackets, if appropriate
            # for instance, "\'{e}" for "\'e", "\c{c}" for "\c c", etc.
            # note: we do not remove brackets (they sometimes matter,
            # e.g. bibtex uses them to prevent lower case transformation)
            if (len(tokens) == 2 and
                    tokens[0].name.startswith('control') and
                    tokens[1].name == 'chars'):
                alt_tokens = (tokens[0], self.lexer.curlylefttoken,
                              tokens[1], self.lexer.curlyrighttoken)
                if alt_tokens not in self.unicode_map:
                    self.max_length = max(self.max_length, len(alt_tokens))
                    self.unicode_map[alt_tokens] = u"{" + unicode_text + u"}"
        if add_encoding:
            self.latex_map[unicode_text] = (latex_text, tokens)
# Module-level translation tables, each built once when this module is
# imported. Each one gets its own lexer instance, which the table uses to
# tokenize the LaTeX side of every registered translation.
_LATEX_UNICODE_TABLE = LatexUnicodeTable(lexer.LatexIncrementalDecoder())
_ULATEX_UNICODE_TABLE = LatexUnicodeTable(
    lexer.UnicodeLatexIncrementalDecoder())

# incremental encoder does not need a buffer
# but decoder does
class LatexIncrementalEncoder(lexer.LatexIncrementalEncoder):
    """Translating incremental encoder for latex.

    Keeps a ``state`` attribute (``'M'`` or ``'S'``) so that it knows
    whether control spaces etc. need to be inserted before the next
    character.
    """

    table = _LATEX_UNICODE_TABLE
    """Translation table."""

    def __init__(self, errors='strict'):
        super(LatexIncrementalEncoder, self).__init__(errors=errors)
        self.reset()

    def reset(self):
        """Reset the base encoder and set the state back to ``'M'``."""
        super(LatexIncrementalEncoder, self).reset()
        self.state = 'M'

    def get_space_bytes(self, bytes_):
        """Inserts space bytes in space eating mode.

        :return: A ``(separator, remaining)`` pair, where *separator* is
            what must be emitted before *remaining*.
        """
        if self.state != 'S':
            # not eating spaces: nothing to insert
            return self.emptychar, bytes_
        # in space eating mode
        if bytes_.startswith(self.spacechar):
            # control space needed: it replaces the leading space
            return self.controlspacechar, bytes_[1:]
        # insert space (it is eaten, but needed for separation)
        return self.spacechar, bytes_

    def _get_latex_bytes_tokens_from_char(self, c):
        """Translate the single character *c*.

        :return: A ``(latex, tokens)`` pair.
        :raises UnicodeEncodeError: If *c* has no translation and
            ``self.errors`` is ``'strict'``.
        :raises ValueError: If *c* has no translation and
            ``self.errors`` is not a supported error mode.
        """
        # if ascii, try latex equivalents
        # (this covers \, #, &, and other special LaTeX characters)
        if ord(c) < 128 and c in self.table.latex_map:
            return self.table.latex_map[c]
        # next, try input encoding
        try:
            encoded = c.encode(self.inputenc, 'strict')
        except UnicodeEncodeError:
            encoded = None
        if encoded is not None:
            text = encoded if self.binary_mode else c
            return text, (lexer.Token(name='chars', text=text),)
        # next, try latex equivalents of common unicode characters
        try:
            return self.table.latex_map[c]
        except KeyError:
            # translation failed: apply the error handling policy
            policy = self.errors
            if policy == 'strict':
                raise UnicodeEncodeError(
                    "latex",  # codec
                    c,  # problematic input
                    0, 1,  # location of problematic character
                    "don't know how to translate {0} into latex"
                    .format(repr(c)))
            if policy == 'ignore':
                return self.emptychar, (self.emptytoken,)
            if policy == 'replace':
                # use the \\char command
                # this assumes
                # \usepackage[T1]{fontenc}
                # \usepackage[utf8]{inputenc}
                codepoint = str(ord(c))
                if self.binary_mode:
                    bytes_ = b'{\\char' + codepoint.encode("ascii") + b'}'
                else:
                    bytes_ = u'{\\char' + codepoint + u'}'
                return bytes_, (lexer.Token(name='chars', text=bytes_),)
            if policy == 'keep' and not self.binary_mode:
                return c, (lexer.Token(name='chars', text=c),)
            raise ValueError(
                "latex codec does not support {0} errors"
                .format(self.errors))

    def get_latex_bytes(self, unicode_, final=False):
        """Yield the LaTeX translation of *unicode_*, piece by piece.

        :raises TypeError: If *unicode_* is not a string (raised when
            the generator is first advanced).
        """
        if not isinstance(unicode_, string_types):
            raise TypeError(
                "expected unicode for encode input, but got {0} instead"
                .format(unicode_.__class__.__name__))
        # convert character by character
        for c in unicode_:
            bytes_, tokens = self._get_latex_bytes_tokens_from_char(c)
            space, bytes_ = self.get_space_bytes(bytes_)
            # update state: after a control word we're eating spaces
            self.state = 'S' if tokens[-1].name == 'control_word' else 'M'
            if space:
                yield space
            yield bytes_
class LatexIncrementalDecoder(lexer.LatexIncrementalDecoder):
    """Translating incremental decoder for LaTeX."""

    table = _LATEX_UNICODE_TABLE
    """Translation table."""

    def __init__(self, errors='strict'):
        lexer.LatexIncrementalDecoder.__init__(self, errors=errors)

    def reset(self):
        """Reset the base decoder and empty the token buffer."""
        lexer.LatexIncrementalDecoder.reset(self)
        self.token_buffer = []

    # python codecs API does not support multibuffer incremental decoders

    def getstate(self):
        """Not supported; always raises :exc:`NotImplementedError`."""
        raise NotImplementedError

    def setstate(self, state):
        """Not supported; always raises :exc:`NotImplementedError`."""
        raise NotImplementedError

    def get_unicode_tokens(self, bytes_, final=False):
        """Yield the unicode translation of *bytes_*, piece by piece.

        Tokens are collected in ``self.token_buffer``. Whenever the tail
        of the buffer is a key of ``self.table.unicode_map``, the tokens
        before that tail are decoded one by one, the translation is
        yielded, and the buffer is emptied. If *final* is true, whatever
        remains in the buffer is decoded and flushed at the end.
        """
        for new_token in self.get_tokens(bytes_, final=final):
            # at this point, token_buffer does not match anything
            self.token_buffer.append(new_token)
            # new token appended at the end, see if we have a match now
            # note: match is only possible at the *end* of the buffer
            # because all other positions have already been checked in
            # earlier iterations
            width = len(self.token_buffer)
            while width > 0:
                tail = tuple(self.token_buffer[-width:])  # last width tokens
                if tail in self.table.unicode_map:
                    translation = self.table.unicode_map[tail]
                    # match!! flush buffer, and translate last bit
                    # exclude last width tokens
                    for pending in self.token_buffer[:-width]:
                        yield self.decode_token(pending)
                    yield translation
                    self.token_buffer = []
                    break
                # no match: try a shorter tail
                width -= 1
            # flush tokens that can no longer match
            # NOTE(review): if table.max_length were 0 (a table with no
            # decode entries) this loop would pop from an empty buffer;
            # confirm that tables always hold at least one decode entry.
            while len(self.token_buffer) >= self.table.max_length:
                yield self.decode_token(self.token_buffer.pop(0))
        # also flush the buffer at the end
        if final:
            for leftover in self.token_buffer:
                yield self.decode_token(leftover)
            self.token_buffer = []
class LatexCodec(codecs.Codec):
    """Stateless codec that delegates to the incremental coder classes.

    ``IncrementalEncoder`` and ``IncrementalDecoder`` are ``None`` here,
    so subclasses have to set them before :meth:`encode` or
    :meth:`decode` can be used.
    """

    IncrementalEncoder = None
    IncrementalDecoder = None

    def encode(self, unicode_, errors='strict'):
        """Convert unicode string to LaTeX bytes.

        :return: A ``(encoded, length_of_input)`` pair.
        """
        incremental = self.IncrementalEncoder(errors=errors)
        encoded = incremental.encode(unicode_, final=True)
        return encoded, len(unicode_)

    def decode(self, bytes_, errors='strict'):
        """Convert LaTeX bytes to unicode string.

        :return: A ``(decoded, length_of_input)`` pair.
        """
        incremental = self.IncrementalDecoder(errors=errors)
        decoded = incremental.decode(bytes_, final=True)
        return decoded, len(bytes_)
class UnicodeLatexIncrementalDecoder(LatexIncrementalDecoder):
    """Variant of :class:`LatexIncrementalDecoder` with ``binary_mode``
    switched off, using the translation table that was built with the
    unicode lexer.
    """

    table = _ULATEX_UNICODE_TABLE
    binary_mode = False


class UnicodeLatexIncrementalEncoder(LatexIncrementalEncoder):
    """Variant of :class:`LatexIncrementalEncoder` with ``binary_mode``
    switched off, so translated characters are returned as text rather
    than as encoded bytes.
    """

    table = _ULATEX_UNICODE_TABLE
    binary_mode = False
def find_latex(encoding):
    """Return a :class:`codecs.CodecInfo` instance for the requested
    LaTeX *encoding*, which must be equal to ``latex`` (or ``ulatex``),
    or to ``latex+<encoding>`` (or ``ulatex+<encoding>``)
    where ``<encoding>`` describes another encoding.

    The underscore form ``latex_<encoding>`` is accepted as well, because
    Python 3.9 and later normalize ``latex+latin1`` to ``latex_latin1``
    before calling codec search functions.

    :param str encoding: The codec name to look up.
    :return: A :class:`codecs.CodecInfo`, or ``None`` if *encoding* is
        not a LaTeX encoding. When no ``<encoding>`` part is given,
        ``ascii`` is used as the input encoding.
    """
    # Prefer an explicit "+", so that a direct call such as
    # find_latex("latex+utf_8") keeps "utf_8" intact; fall back to "_"
    # for names that went through the codec name normalization.
    separator = u"+" if u"+" in encoding else u"_"
    encoding, _, inputenc_ = encoding.partition(separator)
    if not inputenc_:
        inputenc_ = "ascii"
    if encoding == "latex":
        IncEnc = LatexIncrementalEncoder
        DecEnc = LatexIncrementalDecoder
    elif encoding == "ulatex":
        IncEnc = UnicodeLatexIncrementalEncoder
        DecEnc = UnicodeLatexIncrementalDecoder
    else:
        return None

    # Fresh classes are created on every call so that each codec carries
    # its own input encoding.
    class IncrementalEncoder_(IncEnc):
        inputenc = inputenc_

    class IncrementalDecoder_(DecEnc):
        inputenc = inputenc_

    class Codec(LatexCodec):
        IncrementalEncoder = IncrementalEncoder_
        IncrementalDecoder = IncrementalDecoder_

    class StreamWriter(Codec, codecs.StreamWriter):
        pass

    class StreamReader(Codec, codecs.StreamReader):
        pass

    return codecs.CodecInfo(
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=Codec.IncrementalEncoder,
        incrementaldecoder=Codec.IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )