# Source code for nti.contentfragments.interfaces

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Content-related interfaces.
"""

from __future__ import print_function, absolute_import, division
__docformat__ = "restructuredtext en"
import sys
logger = __import__('logging').getLogger(__name__)
# pylint:disable=inherit-non-class,too-many-ancestors,no-self-argument,abstract-method
# pylint:disable=useless-object-inheritance
# Interpreter detection. ``str is bytes`` is only true on Python 2.
PY2 = str is bytes
# PyPy exposes ``sys.pypy_version_info``; CPython does not.
PYPY = hasattr(sys, 'pypy_version_info')
PYPY2 = PY2 and PYPY
# Bind ``copy_reg`` and ``text_type`` to the right objects for the running
# major version so the rest of the module can use a single spelling.
if PY2: # pragma: no cover
    import copy_reg # pylint:disable=import-error
    text_type = unicode # pylint:disable=undefined-variable
else:
    import copyreg as copy_reg
    text_type = str

from zope import component
from zope import interface

from zope.interface.common.collections import ISequence
from zope.interface.common.builtins import INativeString
from zope.interface.common.builtins import IByteString
from zope.interface.common.builtins import ITextString

from zope.contenttype import add_files as zc_add_files

from zope.mimetype import mtypes as mime_types

from zope.schema import NativeStringLine

from nti.schema.field import IndexedIterable

mime_types.setup()  # register interface classes and utilities if not already

# Looked up through ``__import__`` so that only the one function is bound
# in this module's namespace.
resource_filename = __import__('pkg_resources').resource_filename


def _setup():
    """
    Load this package's ``types.csv`` and ``mime.types`` resources and
    register them with :mod:`zope.mimetype` and :mod:`zope.contenttype`.

    This runs once, at import time.
    """
    package = 'nti.contentfragments'

    # Hmm. So this registers things in the zope.mimetype.types module
    # The ZCML directive registers them in the specified module (I think)
    # But we can't use that directive because we need them now in order to
    # implement them.
    type_rows = mime_types.read(resource_filename(package, "types.csv"))
    mime_types.registerUtilities(mime_types.getInterfaces(type_rows), type_rows)

    zc_add_files([resource_filename(package, 'mime.types')])
_setup()

# BWC (backwards compatibility) aliases for the
# ``zope.interface.common.builtins`` interfaces imported above.
# These will be removed in the future.
IString = INativeString
IUnicode = ITextString
IBytes = IByteString


class IContentFragment(interface.Interface):
    """
    Base interface representing the different formats that content can
    be in.
    """

class IUnicodeContentFragment(IContentFragment, ISequence):
    """
    Content represented as a unicode string.

    Although it is simplest to subclass :class:`unicode`, that is not required.
    At a minimum, what is required are the `__getitem__` method (and others
    declared by :class:`IReadSequence`), plus the `encode` method.

    .. versionchanged:: 1.3.0
       Extend ``zope.interface.common.collections.ISequence`` instead of the semi-deprecated
       ``zope.interface.common.sequence.IReadSequence``. Except on PyPy2, where
       ``ISequence`` cannot validate against unicode objects.
    """
    # TODO: extend IUnicode?

if PYPY2: # pragma: no cover
    # Drop ISequence from the interface's bases on PyPy2, keeping every
    # other base in its original order.
    IUnicodeContentFragment.__bases__ = tuple(
        base for base in IUnicodeContentFragment.__bases__
        if base is not ISequence
    )


@interface.implementer(IUnicodeContentFragment)
class UnicodeContentFragment(text_type):
    """
    A text (unicode) string that provides :class:`IUnicodeContentFragment`.

    Subclasses should override the :meth:`__add__` method
    to return objects that implement the appropriate (most derived, generally)
    interface.

    This object *DOES NOT* add a dictionary to the :class:`unicode` type.
    In particular, it should not be weak referenced. Subclasses that
    do not expect to be persisted in the ZODB *may* add additional attributes
    by adding to the ``__slots__`` field (not the instance value).
    """

    # We do need to allow the things used by zope.interface/zope.component
    _ZCA_KEYS = ('__provides__',)

    __slots__ = _ZCA_KEYS  # actually meaningless, but we simulate this with __getattr__ and __setattr__

    def __getattr__(self, name):
        # Only consulted after normal lookup has failed; there are no
        # dynamically computed attributes.
        raise AttributeError(name)

    def __setattr__(self, name, value):
        """
        Allow assignment only to names listed in the class's ``__slots__``.

        :raises AttributeError: for any other name.
        """
        # We do allow the attributes used by the ZCA
        if name in type(self).__slots__:
            super(UnicodeContentFragment, self).__setattr__(name, value)
            return
        raise AttributeError(name, type(self))

    def __getattribute__(self, name):
        """
        Hide ``__dict__`` and ``__weakref__`` and report ``__class__`` as the
        real type; everything else is delegated to the base text type.
        """
        if name in ('__dict__', '__weakref__'):  # Though this does not actually prevent creating a weak ref
            raise AttributeError(name, type(self))
        if name == '__class__':
            return type(self)
        return text_type.__getattribute__(self, name)

    def __setstate__(self, state):
        """
        Restore the slot attributes found in *state*.

        Keys named in ``__slots__`` are popped from *state* (which is
        therefore mutated) and set on this object. Anything left over is
        ignored, with a warning logged unless the only leftover key is
        ``__parent__``.
        """
        # If we had any state saved due to bad pickles in the past
        # ignore it. Do support the ZCA attributes
        if state:
            for k in self.__slots__:
                # ``self`` doubles as the "missing" sentinel.
                v = state.pop(k, self)
                if v is not self:
                    text_type.__setattr__(self, k, v)
            # Anything left is bad and not supported. __parent__ was extremely common at one point
            if state and (len(state) > 1 or '__parent__' not in state):
                logger.warning("Ignoring bad state for %s: %s", self, state)

    def __getstate__(self):
        """
        Return a dict holding only the attributes named in ``__slots__``,
        or an empty tuple when there is nothing to save.
        """
        # Support just the ZCA attributes
        try:
            state = text_type.__getattribute__(self, '__dict__')
        except AttributeError:
            # Hmm, really is a slot
            try:
                state = {'__provides__': self.__provides__}
            except AttributeError:
                state = None
        if state:
            state = {k: v for k, v in state.items() if k in type(self).__slots__}
            return state

        return ()

    def __reduce_ex__(self, protocol):
        """
        Reduce to ``copy_reg.__newobj__(type(self), <plain text>)`` plus the
        state from :meth:`__getstate__` (or ``None`` if that is empty).
        """
        return (copy_reg.__newobj__,  # Constructor
                # Constructor args. Note we pass a real base unicode object;
                # otherwise, we get infinite recursion as pickle tries to
                # reduce use again using __unicode__
                (type(self), self.encode('utf-8').decode('utf-8')),
                self.__getstate__() or None,
                None,
                None)

    def __unicode__(self):
        """
        We are-a unicode instance, but if we don't override this method,
        calling unicode(UnicodeContentFragment('')) produces a plain, base,
        unicode object, thus losing all our interfaces.
        """
        return self

    if not PY2:
        __str__ = __unicode__

        def __getslice__(self, i, j):
            # Part of IReadSequence, deprecated in 2.0, removed in 3,
            # but we still must implement it to comply with the iface.
            raise NotImplementedError()

    def __rmul__(self, times):
        """Like the base ``__rmul__``, but wrap a new result in this object's class."""
        result = text_type.__rmul__(self, times)
        if result is not self:
            result = self.__class__(result)
        return result

    def __mul__(self, times):
        """Like the base ``__mul__``, but wrap a new result in this object's class."""
        result = text_type.__mul__(self, times)
        if result is not self:
            result = self.__class__(result)
        return result

    def translate(self, table):
        """Like the base ``translate``, but wrap a new result in this object's class."""
        result = text_type.translate(self, table)
        if result is not self:
            result = self.__class__(result)
        return result

    def lower(self):
        """
        Return the lowercased text as an instance of this object's class,
        or this very object if lowercasing changes nothing.
        """
        result = text_type.lower(self)
        if result == self:
            return self  # NOTE this is slightly different than what a normal string does
        return self.__class__(result)

    def upper(self):
        """
        Return the uppercased text as an instance of this object's class,
        or this very object if uppercasing changes nothing.
        """
        result = text_type.upper(self)
        if result == self:
            return self  # NOTE this is slightly different than what a normal string does
        return self.__class__(result)

    # shut pylint up about 'bad container'; raise same error super does
    def __delitem__(self, i):
        raise TypeError()

    def __setitem__(self, k, v):
        raise TypeError()


# Fetched with getattr() from the ``mime_types`` module imported above.
IContentTypeTextLatex = getattr(mime_types, 'IContentTypeTextLatex')
class ILatexContentFragment(IUnicodeContentFragment, IContentTypeTextLatex):
    """
    Interface representing content in LaTeX format.
    """


@interface.implementer(ILatexContentFragment)
class LatexContentFragment(UnicodeContentFragment):
    """
    A text string declared to implement :class:`ILatexContentFragment`.
    """


# Fetched with getattr() from the ``mime_types`` module imported above.
IContentTypeTextHtml = getattr(mime_types, 'IContentTypeTextHtml')
class IHTMLContentFragment(IUnicodeContentFragment, IContentTypeTextHtml):
    """
    Interface representing content in HTML format.
    """


# Fetched with getattr() from the ``mime_types`` module imported above.
IContentTypeTextRst = getattr(mime_types, 'IContentTypeTextRst')
class IRstContentFragment(IUnicodeContentFragment, IContentTypeTextRst):
    """
    Interface representing content in RST format.
    """


@interface.implementer(IRstContentFragment)
class RstContentFragment(UnicodeContentFragment):
    """
    A text string declared to implement :class:`IRstContentFragment`.
    """


# NOTE The implementations of the add methods go directly to
# unicode and not up the super() chain to avoid as many extra
# copies as possible

def _add_(self, other, tuples):
    """
    Concatenate *self* and *other*, then convert the result according to
    the first matching rule.

    :param tuples: Sequence of ``(interface, factory)`` rules. The first
        rule whose interface is provided by *other* has its factory applied
        to the concatenated text.
    :return: The factory's result, or the plain concatenation when no rule
        matches.
    """
    combined = text_type.__add__(self, other)
    for rule in tuples:
        if rule[0].providedBy(other):
            return rule[1](combined)
    return combined


class _AddMixin(object):
    """
    Mixin supplying an ``__add__`` driven by the class's ``_add_rules``.
    """

    # Sequence of (interface, factory) pairs; filled in after class creation.
    _add_rules = ()

    def __add__(self, other):
        rules = self._add_rules
        return _add_(self, other, rules)


@interface.implementer(IHTMLContentFragment)
class HTMLContentFragment(_AddMixin, UnicodeContentFragment):
    """
    A text string declared to implement :class:`IHTMLContentFragment`.
    """

# Assigned after the class statement because the rule refers to the class itself.
HTMLContentFragment._add_rules = ((IHTMLContentFragment, HTMLContentFragment),)

class ISanitizedHTMLContentFragment(IHTMLContentFragment):
    """
    HTML content, typically of unknown or untrusted provenance,
    that has been sanitized for "safe" presentation in a generic,
    also unknown browsing context.
    Typically this will mean that certain unsafe constructs, such
    as <script> tags, have been removed.
    """

@interface.implementer(ISanitizedHTMLContentFragment)
class SanitizedHTMLContentFragment(HTMLContentFragment):
    """
    A text string declared to implement :class:`ISanitizedHTMLContentFragment`.
    """

# TODO: What about the rules for the other types?
# The sanitized rule comes first so it is tried before the plain HTML rule.
SanitizedHTMLContentFragment._add_rules = \
    ((ISanitizedHTMLContentFragment, SanitizedHTMLContentFragment),) + \
    HTMLContentFragment._add_rules

# Fetched with getattr() from the ``mime_types`` module imported above.
IContentTypeTextPlain = getattr(mime_types, 'IContentTypeTextPlain')
class IPlainTextContentFragment(IUnicodeContentFragment, IContentTypeTextPlain):
    """
    Interface representing content in plain text format.
    """

@interface.implementer(IPlainTextContentFragment)
class PlainTextContentFragment(UnicodeContentFragment):
    """
    A text string declared to implement :class:`IPlainTextContentFragment`.
    """

@interface.implementer(IPlainTextContentFragment)
@component.adapter(IPlainTextContentFragment)
def _plain_text_to_plain_text(text):
    """
    Identity adapter from plain text to plain text: hands back *text*
    unchanged.
    """
    # We shouldn't actually be able to get here.
    return text # pragma: no cover

from zope.schema.interfaces import ITokenizedTerm

class ICensoredTerm(ITokenizedTerm):
    """
    Base interface for a censored term.
    """

class IProfanityTerm(ICensoredTerm):
    """
    Base interface for a profanity term.
    """

class ICensoredUnicodeContentFragment(IUnicodeContentFragment):
    """
    A content fragment that has passed through a censoring process to
    attempt to ensure it is safe for display to its intended audience (e.g.,
    profanity has been removed if the expected audience is underage/sensitive to
    that).

    The rules for censoring content will be very context specific. In
    particular, it will depend on *who* you are, and *where* you are
    adding/editing content. The *who* is important to differentiate
    between, e.g., students and teachers. The *where* is important to
    differentiate between, say, a public forum, and your private notes, or
    between your Human Sexuality textbook and your Calculus textbook.

    For this reason, the censoring process will typically utilize
    multi-adapters registered on (creator, content_unit). Contrast this with
    sanitizing HTML, which always follows the same process.
    """

@interface.implementer(ICensoredUnicodeContentFragment)
class CensoredUnicodeContentFragment(_AddMixin, UnicodeContentFragment):
    """
    A text string declared to implement :class:`ICensoredUnicodeContentFragment`.
    """

# Rules are tried in order: adding censored content stays censored, while
# adding any other unicode fragment yields a plain UnicodeContentFragment.
CensoredUnicodeContentFragment._add_rules = (
    (ICensoredUnicodeContentFragment, CensoredUnicodeContentFragment),
    (IUnicodeContentFragment, UnicodeContentFragment)
)

class ICensoredPlainTextContentFragment(IPlainTextContentFragment, ICensoredUnicodeContentFragment):
    """
    Plain text content that is also censored content.
    """

@interface.implementer(ICensoredPlainTextContentFragment)
class CensoredPlainTextContentFragment(PlainTextContentFragment):
    """
    A text string declared to implement :class:`ICensoredPlainTextContentFragment`.
    """

# ``censored(n)`` ignores the receiver ``s`` and wraps ``n`` in the censored
# plain-text class, for both the uncensored and the censored variant.
PlainTextContentFragment.censored = lambda s, n: CensoredPlainTextContentFragment(n)
CensoredPlainTextContentFragment.censored = lambda s, n: CensoredPlainTextContentFragment(n)

class ICensoredHTMLContentFragment(IHTMLContentFragment, ICensoredUnicodeContentFragment):
    """
    HTML content that is also censored content.
    """

@interface.implementer(ICensoredHTMLContentFragment)
class CensoredHTMLContentFragment(HTMLContentFragment):
    """
    A text string declared to implement :class:`ICensoredHTMLContentFragment`.
    """

# The censored-HTML rule is tried first, then the generic censored rules.
CensoredHTMLContentFragment._add_rules = \
    ((ICensoredHTMLContentFragment, CensoredHTMLContentFragment),) + \
    CensoredUnicodeContentFragment._add_rules
# ``censored(n)`` ignores the receiver ``s`` and wraps ``n`` in this class.
CensoredHTMLContentFragment.censored = lambda s, n: CensoredHTMLContentFragment(n)

class ICensoredSanitizedHTMLContentFragment(ISanitizedHTMLContentFragment, ICensoredHTMLContentFragment):
    """
    HTML content that is both sanitized and censored.
    """

@interface.implementer(ICensoredSanitizedHTMLContentFragment)
class CensoredSanitizedHTMLContentFragment(CensoredHTMLContentFragment):
    """
    A text string declared to implement
    :class:`ICensoredSanitizedHTMLContentFragment`.
    """

# The rules here place sanitization ahead of censoring, because sanitization
# can cause security problems for end users; censoring is just offensive
CensoredSanitizedHTMLContentFragment._add_rules = (
    ((ICensoredSanitizedHTMLContentFragment, CensoredSanitizedHTMLContentFragment),
     (ISanitizedHTMLContentFragment, SanitizedHTMLContentFragment),)
    + CensoredHTMLContentFragment._add_rules
    + HTMLContentFragment._add_rules
)

# Attach a ``censored(n)`` method to each remaining fragment class. Every
# lambda ignores its receiver ``s`` and wraps ``n`` in the censored class
# named on the right-hand side.
HTMLContentFragment.censored = lambda s, n: CensoredHTMLContentFragment(n)
UnicodeContentFragment.censored = lambda s, n: CensoredUnicodeContentFragment(n)
SanitizedHTMLContentFragment.censored = lambda s, n: CensoredSanitizedHTMLContentFragment(n)
CensoredSanitizedHTMLContentFragment.censored = lambda s, n: CensoredSanitizedHTMLContentFragment(n)

# See http://code.google.com/p/py-contentfilter/
# and https://hkn.eecs.berkeley.edu/~dyoo/python/ahocorasick/

class ICensoredContentScanner(interface.Interface):
    """
    Something that can perform censoring.

    Variations of censoring scanners will be registered
    as named utilities. Particular censoring solutions (the adapters discussed
    in :class:`ICensoredUnicodeContentFragment`) will put together
    a combination of these utilities to produce the desired result.

    The censoring process can further be broken down into two parts:
    detection of unwanted content, and reacting to unwanted content. For example,
    reacting might consist of replacing the content with asterisks in plain text,
    or a special span in HTML, or it might throw an exception to disallow the content
    altogether. This object performs the first part.

    The names may be something like MPAA ratings, or they may follow other categories.
    """

    def scan(content_fragment):
        """
        Scan the given content fragment for censored terms and return
        their positions as a sequence (iterator) of two-tuples (start,
        end). The returned tuples should be non-overlapping.
        """

class ICensoredContentStrategy(interface.Interface):
    """
    The other half of the content censoring process explained in
    :class:`ICensoredContentScanner`, responsible for taking action
    on censoring content.
    """

    def censor_ranges(content_fragment, censored_ranges):
        """
        Censors the content fragment appropriately and returns the censored value.

        :param content_fragment: The fragment being censored.
        :param censored_ranges: The ranges of illicit content as produced by
            :meth:`ICensoredContentScanner.scan`; they are not guaranteed to be in any
            particular order so you may need to sort them with :func:`sorted` (in reverse)
        :return: The censored content fragment, if any censoring was done to it.
            May also raise a :class:`ValueError` if censoring is not
            allowed and the content should be thrown away.
        """

class ICensoredContentPolicy(interface.Interface):
    """
    A top-level policy puts together detection of content ranges
    to censor with a strategy to censor them.
    """

    def censor(content_fragment, context):
        """
        Censors the content fragment appropriately and returns the censored value.

        :param content_fragment: The fragment being censored.
        :param context: The object that this content fragment should be censored
            with regard to. For example, the fragment's container or composite
            object that will hold the fragment.
        :return: The censored content fragment, if any censoring was done to it.
            May also raise a :class:`ValueError` if censoring is not
            allowed and the content should be thrown away.
        """

class IHyperlinkFormatter(interface.Interface):
    """
    Something that can find hyperlinks in text and turn plain text links
    in HTML fragments into ``<a>`` elements.
    """

    def find_links(text):
        """
        Given a string of `text`, look through it for hyperlinks and find them.

        :return: A sequence of strings and `lxml.etree.Element` objects representing
            the plain text and detected links, in order, within the given text.
        """

    def format(html_fragment):
        """
        Process the specified ``IHTMLContentFragment``, scanning it for any
        plain text links recognized by this object and inserting new ``<a>``
        elements for them.
        """

class ICensoredContentEvent(interface.Interface):
    """
    Describes one censoring of a content fragment: the original content,
    the censored result, and where the result is to be assigned.
    """
    content_fragment = interface.Attribute("The content that was censored")
    censored_content = interface.Attribute("The censored content")
    name = interface.Attribute("The name of the attribute under which the censor content will be assigned.")
    context = interface.Attribute("The context object where the object will be assigned to.")

@interface.implementer(ICensoredContentEvent)
class CensoredContentEvent(object):
    """
    Plain attribute holder implementing :class:`ICensoredContentEvent`.

    :param content_fragment: The content that was censored.
    :param censored_content: The censored content.
    :param name: Optional name of the attribute the censored content
        will be assigned under.
    :param context: Optional object the censored content will be assigned to.
    """

    def __init__(self, content_fragment, censored_content, name=None, context=None):
        self.content_fragment = content_fragment
        self.censored_content = censored_content
        self.name = name
        self.context = context

class ITextLatexEscaper(interface.Interface):
    """
    A callable that escapes text.
    """

    # The original spelled this ``__call_`` (one trailing underscore), which
    # declared an unrelated method instead of describing the call protocol.
    def __call__(text):
        """
        Escape the specified text.
        """

# Punctuation

class IPunctuationMarkExpression(interface.Interface):
    """
    Marker interface for a punctuation regular expression.
    """
# Second name bound to the same interface.
IPunctuationCharExpression = IPunctuationMarkExpression

class IPunctuationMarkExpressionPlus(interface.Interface):
    """
    Marker interface for a punctuation + space regular expression.
    """
# Second name bound to the same interface.
IPunctuationCharExpressionPlus = IPunctuationMarkExpressionPlus

class IPunctuationMarkPattern(interface.Interface):
    """
    Marker interface for a punctuation regular expression pattern.
    """
# Second name bound to the same interface.
IPunctuationCharPattern = IPunctuationMarkPattern

class IPunctuationMarkPatternPlus(interface.Interface):
    """
    Marker interface for a punctuation + space regular expression pattern.
    """
# Second name bound to the same interface.
IPunctuationCharPatternPlus = IPunctuationMarkPatternPlus

## Schema Fields

from zope.schema.interfaces import IObject
from zope.schema.interfaces import IText
from zope.schema.interfaces import ITextLine

class ITextUnicodeContentFragmentField(IObject, IText):
    """
    A :class:`zope.schema.Text` type that also requires the object implement
    an interface descending from :class:`~.IUnicodeContentFragment`.

    .. versionadded:: 1.2.0
    """

class ITextLineUnicodeContentFragmentField(IObject, ITextLine):
    """
    A :class:`zope.schema.TextLine` type that also requires the object implement
    an interface descending from :class:`~.IUnicodeContentFragment`.

    .. versionadded:: 1.2.0
    """

class ILatexFragmentTextLineField(ITextLineUnicodeContentFragmentField):
    """
    A :class:`~zope.schema.TextLine` that requires content to be in LaTeX format.

    .. versionadded:: 1.2.0
    """


class IPlainTextLineField(ITextLineUnicodeContentFragmentField):
    """
    A :class:`~zope.schema.TextLine` that requires content to be plain text.
    """


class IHTMLContentFragmentField(ITextUnicodeContentFragmentField):
    """
    A :class:`~zope.schema.Text` type that also requires the object implement
    an interface descending from :class:`.IHTMLContentFragment`.

    .. versionadded:: 1.2.0
    """


class ISanitizedHTMLContentFragmentField(IHTMLContentFragmentField):
    """
    A :class:`~zope.schema.Text` type that also requires the object implement
    an interface descending from :class:`.ISanitizedHTMLContentFragment`.

    .. versionadded:: 1.2.0
    """

class IPlainTextField(ITextUnicodeContentFragmentField):
    """
    A :class:`zope.schema.Text` that requires content to be plain text.

    .. versionadded:: 1.2.0
    """


class IRstContentFragmentField(ITextUnicodeContentFragmentField):
    """
    A :class:`~zope.schema.Text` type that also requires the object implement
    an interface descending from :class:`.IRstContentFragment`.

    .. versionadded:: 1.6.0
    """


class ITagField(IPlainTextLineField):
    """
    Requires its content to be only one plain text word that is lowercased.

    .. versionadded:: 1.2.0
    """

class IAllowedAttributeProvider(interface.Interface):
    """
    A way to provide a whitelist of additional attribute names that would be
    allowed while parsing a content fragment, thus extending the attributes
    already allowed.

    .. versionadded:: 1.4.0
    """

    # Optional; defaults to an empty tuple, i.e. no extra attributes.
    allowed_attributes = IndexedIterable(title=u"An iterable of attribute names allowed in a particular context",
                                         value_type=NativeStringLine(title=u"The attribute name"),
                                         default=(),
                                         required=False)