# Source code for nti.contentfragments.latex

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Implementations of content fragment transformers for latex.

.. $Id: latex.py 85352 2016-03-26 19:08:54Z carlos.sanchez $
"""

from __future__ import print_function, absolute_import, division
__docformat__ = "restructuredtext en"

logger = __import__('logging').getLogger(__name__)

import re

from zope import component
from zope import interface

from nti.contentfragments.interfaces import ITextLatexEscaper
from nti.contentfragments.interfaces import LatexContentFragment
from nti.contentfragments.interfaces import ILatexContentFragment
from nti.contentfragments.interfaces import IPlainTextContentFragment

# Unicode operator characters paired with the TeX that replaces them
# inside a detected equation. No ``$`` delimiters appear here: the
# converter wraps the whole equation in them.
_TEX_OPERATORS = [
    (u'\u00d7', u'\\times'),       # MULTIPLICATION SIGN
    (u'\u2013', u'-'),             # EN DASH
    (u'\u2212', u'-'),             # MINUS SIGN
    (u'\u2260', u'\\neq'),         # NOT EQUAL TO
    (u'\u00f7', u'\\div'),         # DIVISION SIGN
    (u'\u2026', u'\\ldots '),      # HORIZONTAL ELLIPSIS
    (u'\u221a', u'\\surd'),        # SQUARE ROOT (radical sign)
    (u'\u2192', u'\\rightarrow'),  # RIGHTWARDS ARROW
    (u'\uf0d0', u'\\angle'),       # private-use code point
    (u'\uf044', u'\\triangle'),    # private-use code point
    (u'\u2248', u'\\approx'),      # ALMOST EQUAL TO
]

# The same pairs keyed by integer code point, which is the table shape
# that ``translate`` on a unicode string expects.
_TEX_OPERATOR_MAP = dict((ord(_char), _tex) for _char, _tex in _TEX_OPERATORS)

# Legacy byte-keyed escape table, kept commented out for reference only;
# the replacements actually applied are listed in ``_escapes`` below.
# charmap_extn = {
#   u'\u20ac'.encode('utf8'): r'\euro ',
#   u'\u00bd'.encode('utf8'): r'$\frac{1}{2}$',
#   u'\uf020'.encode('utf8'): " ", # 0xef80a0
#   u'\uf02c'.encode('utf8'): " ", # 0xef80ac
#   u'\uf02f'.encode('utf8'): "/", # 0xef80af
#   u'\uf02e'.encode('utf8'): ".",
#   u'\uf06c'.encode('utf8'): " ", # 0xef81ac
#   u'\u2022'.encode('utf8'): r"*", # 0xe280a2 (bullet)
#   u'\u2212'.encode('utf8'): r"-", # 0xe28892
#   u'\u2264'.encode('utf8'): r"$\le$", # 0xe289a4
#   u'\u2265'.encode('utf8'): r"$\ge$", # 0xe289a5
#   u'\u2248'.encode('utf8'): r"$\approx$", # 0xe28988
#   u'\u221E'.encode('utf8'): r"$\infty$", # 0xe2889e
#   u'\u03bc'.encode('utf8'): r'$\mu$', # 0xcebc
#   u'\u03A3'.encode('utf8'): r'$\Sigma$', # 0xcea3
#   u'\uf032'.encode('utf8'): r'$\prime$', # 0xef80b2
#   u'\u03b1'.encode('utf8'): r'$\alpha$', # 0xceb1
#   u'\u03b2'.encode('utf8'): r'$\beta$', # 0xceb2
#   u'\u03b3'.encode('utf8'): r'$\gamma$', # 0xceb3
#   u'\u03c1'.encode('utf8'): r'$\rho$', # 0xcf81
#   u'\u03c3'.encode('utf8'): r'$\sigma$', # 0xcf83
#   u'\u00ad'.encode('utf8'): r'', # 0xc2ad (soft hyphen)
#   u'\u03A0'.encode('utf8'): r'$\Pi$', # 0xcea0
#   u'\u0394'.encode('utf8'): r'$\Delta$', # 0xce94
#   u'\u00b5'.encode('utf8'): r'$\mu$', # 0xc2b5 (micro sign)
#   # This is actually the CENT SIGN, but in the symbol font
#   # it comes in as prime.
#   u'\u00a2'.encode('utf8'): r'$\prime$', # 0xc2a2
# }

# Ordered ``(needle, replacement)`` pairs applied one after another by
# ``_escape_tex``. Order matters:
#
# * ``$`` and ``%`` are escaped first, so the ``$...$`` wrappers that
#   later replacements introduce are not themselves escaped;
# * the private-use ``.`` and space substitutes come before ``. . .``,
#   so a spaced ellipsis assembled from them is still collapsed.
#
# Every needle appears exactly once; a repeated needle can never match
# after its first replacement and only costs an extra pass over the text.
_escapes = [
    (u'$', u'\\$'),
    (u'%', u'\\%'),
    (u'\xa2', u'$\\prime$'),  # CENT SIGN, deliberately rendered as prime
    (u'\xad', u''),           # SOFT HYPHEN is dropped
    (u'\xb5', u'$\\mu$'),
    (u'\xbd', u'$\\frac{1}{2}$'),
    (u'\xd7', u'$\\times$'),
    (u'\xf7', u'$\\div$'),
    (u'\u0394', u'$\\Delta$'),
    (u'\u03a0', u'$\\Pi$'),
    (u'\u03a3', u'$\\Sigma$'),
    (u'\u03b1', u'$\\alpha$'),
    (u'\u03b2', u'$\\beta$'),
    (u'\u03b3', u'$\\gamma$'),
    (u'\u03bc', u'$\\mu$'),
    (u'\u03c0', u'$\\pi$'),
    (u'\u03c1', u'$\\rho$'),
    (u'\u03c3', u'$\\sigma$'),
    (u'\u2013', u'-'),
    (u'\u2014', u'---'),
    (u'\u2019', u"'"),
    (u'\u201c', u'``'),
    (u'\u201d', u"''"),
    (u'\u2022', u'*'),
    # JAM: Why is this commented out? It has been since the
    # first revision of this file, but it seems valid
    # (u'\u2026', u'$\\ldots$'),
    # (U+2026 is handled near the end of this list, as u'\\ldots '.)
    (u'\u20ac', u'\\euro '),
    (u'\u2192', u'$\\rightarrow$'),
    (u'\u2212', u'-'),
    (u'\u221a', u'$\\surd$'),
    (u'\u221e', u'$\\infty$'),
    (u'\u2248', u'$\\approx$'),
    (u'\u2260', u'$\\neq$'),
    (u'\u2264', u'$\\le$'),
    (u'\u2265', u'$\\ge$'),
    (u'\uf020', u' '),
    (u'\uf02c', u' '),
    (u'\uf02e', u'.'),
    (u'\uf02f', u'/'),
    (u'\uf032', u'$\\prime$'),
    (u'\uf044', u'$\\triangle$'),
    (u'\uf06c', u' '),
    (u'\uf0d0', u'$\\angle$'),
    (u'. . .', u'\\ldots '),
    (u'\u2026', u'\\ldots '),
    (u'\u00A7', u'\\S'),
]

def _escape_tex(text):
    """
    Return *text* with every ``_escapes`` replacement applied.

    The pairs are applied sequentially, in list order, each one
    operating on the output of the previous replacement.
    """
    result = text
    for needle, replacement in _escapes:
        result = result.replace(needle, replacement)
    return result

@interface.implementer(ITextLatexEscaper)
class _DefaultTextLatexEscaper(object):
    """
    Callable declared to implement ``ITextLatexEscaper`` that applies
    this module's built-in replacements via ``_escape_tex``.
    """

    # Instances carry no attributes of their own.
    __slots__ = ()

    def __call__(self, text):
        """Return *text* as escaped by ``_escape_tex``."""
        escaped = _escape_tex(text)
        return escaped

def escape_tex(text, name=u''):
    """
    Escape *text* for LaTeX using a pluggable escaper.

    :param text: The string to escape.
    :param name: Name under which to look up an ``ITextLatexEscaper``
        utility; defaults to the unnamed registration.
    :return: Whatever the selected escaper returns for *text*. If no
        utility is registered under *name*, the module's own
        ``_escape_tex`` is used.
    """
    escaper = component.queryUtility(ITextLatexEscaper, name=name)
    if escaper is None:
        # Nothing registered under that name: use the built-in rules.
        escaper = _escape_tex
    return escaper(text)

# Tokens that, standing alone, mark the middle of an equation.
_PLAIN_BINARY_OPS = (
    u'+', u'-', u'*', u'/',
    u'=', u'<', u'>',
    u'\u2260',
)
# Just the unicode characters out of the (character, TeX) operator pairs.
_UNICODE_OPS = [_pair[0] for _pair in _TEX_OPERATORS]

# Bare parentheses are allowed inside an equation as well.
_PLAIN_ACCEPTS = (u'(', u')')

# All patterns are unicode literals. Without ``unicode_literals`` a plain
# ``'\u03C0'`` is, on Python 2, a literal backslash sequence rather than
# the pi character, which silently corrupts a character class.

# Digits, optionally followed by one trailing punctuation mark: ``42``, ``42,``
_naturalNumberPattern = re.compile(u'^[0-9]+[.?,]?$')
# Digits around a decimal point, optional trailing punctuation: ``3.14``, ``.5?``
_realNumberPattern = re.compile(u'^[0-9]*\\.[0-9]*[.?,]?$')
# A number followed by a single variable letter or pi: ``2x``, ``3.5y``, ``2\u03c0``.
# NOTE(review): ``a``/``A`` are excluded from the letter class; the reason
# is not stated anywhere in this module.
_SIMPLE_ALGEBRA_TERM_PAT = re.compile(u'^[0-9]+\\.?[0-9]*[b-zB-Z\u03C0]$')
# A single letter followed by a number: ``x2``, ``y3.5``
_PRE_SIMPLE_ALGEBRA_TERM_PAT = re.compile(u'^[a-zA-Z][0-9]+\\.?[0-9]*$')
# A lone ASCII letter used as a variable
_SIMPLE_ALGEBRA_VAR = re.compile(u'^[a-zA-Z]$')

# Punctuation that may trail a token and end an equation
_TRAILING_PUNCT = (u',', u'.', u'?')

def _is_bare_equation_component(token):
    """Return whether *token*, taken exactly as given, is an operator, a parenthesis, a number or a simple algebra term."""
    if (token in _PLAIN_BINARY_OPS
            or token in _PLAIN_ACCEPTS
            or token in _UNICODE_OPS):
        return True
    return any(pattern.match(token)
               for pattern in (_naturalNumberPattern,
                               _realNumberPattern,
                               _SIMPLE_ALGEBRA_TERM_PAT,
                               _PRE_SIMPLE_ALGEBRA_TERM_PAT,
                               _SIMPLE_ALGEBRA_VAR))


def is_equation_component(token):
    """
    Decide whether a whitespace-delimited token can be part of an equation.

    A token qualifies when it is a plain or unicode operator, a bare
    parenthesis, a natural or real number, or a simple algebraic term or
    variable. Any number of leading ``(`` characters, and any number of
    trailing ``)`` or trailing-punctuation characters, may be stripped
    first, so ``(7``, ``7)`` and ``7),`` all qualify.

    :param token: The token to test; may be empty or ``None``.
    :return: ``True`` or ``False`` (always a real ``bool``).
    """
    if not token:
        return False  # False for empty tokens

    # Stripping a leading '(' and stripping a trailing ')'/punctuation
    # character are independent, so every candidate is ``token`` minus some
    # prefix of its leading '(' run and some suffix of its trailing run.
    # Enumerating those directly visits each candidate once. Trying "strip
    # left, else strip right" recursively revisits the same candidates along
    # every path, which is exponential for tokens like '((((@))))' and is
    # limited by the interpreter's recursion depth.
    length = len(token)
    max_lead = length - len(token.lstrip(u'('))
    max_trail = length - len(token.rstrip(u')' + u''.join(_TRAILING_PUNCT)))
    for lead in range(max_lead + 1):
        for trail in range(max_trail + 1):
            core = token[lead:length - trail]
            if core and _is_bare_equation_component(core):
                return True
    return False

def cleanup_equation_tokens(tokens):
    """
    Perform cleanups on the individual tokens that make up an
    equation before converting it to string form.

    At present the only cleanup is to move a single trailing
    punctuation character off the last token so that it can be
    placed outside the math delimiters.

    :param tokens: Sequence of token strings; it is never modified.
        An empty sequence, or an empty last token, is returned unchanged.
    :return: A 3-tuple: (before string, tokens, after_string). The
        before string is always empty. When punctuation is removed,
        ``tokens`` is a new list and ``after_string`` is that
        character; otherwise the input object is returned as-is and
        ``after_string`` is empty.
    """
    # This is a partial implementation that grows as needed
    if not tokens or not tokens[-1] or tokens[-1][-1] not in _TRAILING_PUNCT:
        return (u'', tokens, u'')

    last = tokens[-1]
    # Copy so the caller's sequence is left untouched.
    tokens = list(tokens)
    tokens[-1] = last[:-1]
    return (u'', tokens, last[-1])

@interface.implementer(ILatexContentFragment)
@component.adapter(IPlainTextContentFragment)
def PlainTextToLatexFragmentConverter(plain_text, text_scaper=u''):
    """
    Attempt to convert plain-text strings into LaTeX strings
    by detecting equations/expressions that could be rendered in
    latex markup.

    :param plain_text: The text to convert.
    :param text_scaper: Name passed to :func:`escape_tex` to select the
        escaper applied to every token that is not part of an equation.
    :return: A ``LatexContentFragment``. Detected equations are wrapped
        in ``$...$`` with their unicode operators translated to TeX;
        all other tokens are escaped. Tokens are re-joined with single
        spaces, so interior whitespace runs collapse, but one leading
        and/or trailing space is kept if the input had whitespace there.
        Input consisting only of whitespace is returned unchanged.
    """
    # We do a crappy job of trying to parse out expression-like things
    # with a hand-rolled parser. There are certainly better ways. One might
    # be to extract the math parsing algorithm from plasTeX; we'd still have to
    # figure out what makes sense, though

    # SAJ: Before we do anything test and see if we were given a run of pure
    # white space.  If so, just return what we were given.
    if plain_text.isspace():
        return LatexContentFragment(plain_text)

    # First, replace some whitespace sensitive tokens
    plain_text = plain_text.replace(u'. . .', u'\u2026')  # Ellipsis

    # Then, tokenize on whitespace. If the math is poorly delimited, this
    # will fail
    tokens = plain_text.split()

    # Run through until we find an operator. Back up while the previous
    # tokens are equation components. Go forward while the tokens are
    # equation components. Repeat until we have consumed all the tokens.
    accum = []

    # Each time through the loop we'll either consume an equation and everything
    # before it, or we'll take no action. When we reach the end naturally,
    # everything left is not an equation
    i = 0
    while i < len(tokens):
        if tokens[i] not in _PLAIN_BINARY_OPS:
            # Not a constituent, go forward
            i += 1
            continue

        # Walk left from the operator over consecutive equation components.
        beginning = i
        while beginning > 0 and is_equation_component(tokens[beginning - 1]):
            beginning -= 1
        if beginning == i:
            # We didn't move backwards at all. This is not part of an equation
            i += 1
            continue

        # Walk right; ``end`` is one past the last token of the equation.
        end = i + 1
        while end < len(tokens):
            token = tokens[end]
            if not is_equation_component(token):
                break
            end += 1
            if token[-1] in _TRAILING_PUNCT:
                # Trailing punctuation closes the equation; the token
                # itself is still included.
                break
        if end == i + 1:
            # We didn't move forwards at all. Hmm. A dangling
            # part of an equation.
            i += 1
            continue

        bef, eq_tokens, aft = cleanup_equation_tokens(tokens[beginning:end])
        eq = u' '.join(eq_tokens).translate(_TEX_OPERATOR_MAP)

        # Everything before us goes in the accumulator
        accum.extend([escape_tex(x, name=text_scaper) for x in tokens[:beginning]])
        # and then us
        accum.append(bef + u'$' + eq + u'$' + aft)
        # and now we can remove the beginning and start over
        del tokens[:end]
        i = 0

    # Any tokens left go in the accumulator
    accum.extend([escape_tex(x, name=text_scaper) for x in tokens])

    # SAJ: If the fragment starts or ends with a space, respect that.
    # An empty string at either end becomes a single space when joined.
    if plain_text and plain_text[0].isspace():
        accum.insert(0, u'')

    if plain_text and plain_text[-1].isspace():
        accum.append(u'')

    return LatexContentFragment(u' '.join(accum))