# spaCy/spacy/tokens/token.pyx

# cython: infer_types=True
# coding: utf8
from __future__ import unicode_literals

from libc.string cimport memcpy
from cpython.mem cimport PyMem_Malloc, PyMem_Free
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
cimport numpy as np
np.import_array()
import numpy

from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme
from .. import parts_of_speech
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
from ..compat import is_config
from ..errors import Errors, Warnings, user_warning, models_warning
from .. import util
from .underscore import Underscore, get_ext_args
from .morphanalysis cimport MorphAnalysis


cdef class Token:
    """An individual token – i.e. a word, punctuation symbol, whitespace,
    etc."""
    @classmethod
    def set_extension(cls, name, **kwargs):
        """Register a custom extension attribute for `Token` objects.

        name: Name of the extension; used as the key in
            `Underscore.token_extensions`.
        **kwargs: Forwarded to `get_ext_args`. Passing `force=True` allows an
            already-registered name to be overwritten; otherwise a
            `ValueError` is raised for duplicates.
        """
        overwrite = kwargs.get('force', False)
        if not overwrite and cls.has_extension(name):
            raise ValueError(Errors.E090.format(name=name, obj='Token'))
        Underscore.token_extensions[name] = get_ext_args(**kwargs)

    @classmethod
    def get_extension(cls, name):
        """Look up a registered extension by name.

        name: Name of the extension.
        RETURNS: The entry stored in `Underscore.token_extensions` for
            `name`, or None if nothing is registered under that name.
        """
        return Underscore.token_extensions.get(name)

    @classmethod
    def has_extension(cls, name):
        """Check whether an extension is registered under `name`.

        name: Name of the extension.
        RETURNS (bool): Whether `name` is a key of
            `Underscore.token_extensions`.
        """
        return name in Underscore.token_extensions

    @classmethod
    def remove_extension(cls, name):
        """Unregister a previously registered extension.

        name: Name of the extension to remove.
        RETURNS: The entry that was stored for `name`.
        RAISES (ValueError): If no extension is registered under `name`.
        """
        if cls.has_extension(name):
            return Underscore.token_extensions.pop(name)
        raise ValueError(Errors.E046.format(name=name))

    def __cinit__(self, Vocab vocab, Doc doc, int offset):
        """Construct a `Token` object.

        vocab (Vocab): A storage container for lexical types.
        doc (Doc): The parent document.
        offset (int): The index of the token within the document.
        """
        self.vocab = vocab
        self.doc = doc
        # Pointer into the parent doc's C token array. No bounds check is
        # performed here, so `offset` must already be a valid index.
        self.c = &self.doc.c[offset]
        self.i = offset

    def __hash__(self):
        # Hash on the parent doc plus the token's index within it.
        return hash((self.doc, self.i))

    def __len__(self):
        """The number of unicode characters in the token, i.e. `token.text`.

        RETURNS (int): The number of unicode characters in the token.
        """
        # The length is stored on the lexeme, not recomputed from the text.
        return self.c.lex.length

    def __unicode__(self):
        # Unicode form is the verbatim token text (no trailing whitespace).
        return self.text

    def __bytes__(self):
        # Byte form is the verbatim token text encoded as UTF-8.
        return self.text.encode('utf8')

    def __str__(self):
        # `str` is bytes on Python 2 and unicode on Python 3, so pick the
        # matching representation.
        if not is_config(python3=True):
            return self.__bytes__()
        return self.__unicode__()

    def __repr__(self):
        # The repr is simply the string form of the token.
        return self.__str__()

    def __richcmp__(self, Token other, int op):
        # Rich comparison between tokens, ordered by character offset (`idx`).
        # Op codes handled below: 0 is <, 1 is <=, 2 is ==, 3 is !=, 4 is >,
        # 5 is >=. Equality and inequality additionally require both tokens
        # to belong to the same `Doc` object.
        # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
        if other is None:
            # Comparing against None: <, <= and == are False, the rest True.
            if op in (0, 1, 2):
                return False
            else:
                return True
        cdef Doc my_doc = self.doc
        cdef Doc other_doc = other.doc
        my = self.idx
        their = other.idx
        if op == 0:
            return my < their
        elif op == 2:
            if my_doc is other_doc:
                return my == their
            else:
                return False
        elif op == 4:
            return my > their
        elif op == 1:
            return my <= their
        elif op == 3:
            if my_doc is other_doc:
                return my != their
            else:
                return True
        elif op == 5:
            return my >= their
        else:
            raise ValueError(Errors.E041.format(op=op))

    def __reduce__(self):
        # Always raises: reducing (and therefore pickling) a Token on its own
        # is not supported.
        raise NotImplementedError(Errors.E111)

    @property
    def _(self):
        """Access point for the extensions registered via `set_extension`.

        RETURNS (Underscore): A fresh `Underscore` wrapper bound to this token,
            created with `start` set to the token's character offset and
            `end` set to None.
        """
        return Underscore(Underscore.token_extensions, self,
                          start=self.idx, end=None)

    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
        """Check the value of a boolean flag.

        flag_id (int): The ID of the flag attribute.
        RETURNS (bool): Whether the flag is set.

        EXAMPLE:
            >>> from spacy.attrs import IS_TITLE
            >>> doc = nlp(u'Give it back! He pleaded.')
            >>> token = doc[0]
            >>> token.check_flag(IS_TITLE)
            True
        """
        # Flags are looked up on the token's lexeme, not on the token itself.
        return Lexeme.c_check_flag(self.c.lex, flag_id)

    def nbor(self, int i=1):
        """Get a neighboring token.

        i (int): Offset of the wanted token relative to this one; negative
            values look to the left. Defaults to 1.
        RETURNS (Token): The token at position `self.doc[self.i+i]`.
        RAISES (IndexError): If the resulting position lies outside the doc.
        """
        position = self.i + i
        doc_length = len(self.doc)
        if position < 0 or position >= doc_length:
            raise IndexError(Errors.E042.format(i=self.i, j=i, length=doc_length))
        return self.doc[position]

    def similarity(self, other):
        """Make a semantic similarity estimate. The default estimate is cosine
        similarity using an average of word vectors.

        If a 'similarity' entry is present in `doc.user_token_hooks`, it is
        called with `(self, other)` and its result is returned as-is.

        other (object): The object to compare with. By default, accepts `Doc`,
            `Span`, `Token` and `Lexeme` objects.
        RETURNS (float): A scalar similarity score. Higher is more similar.
        """
        if 'similarity' in self.doc.user_token_hooks:
            # A similarity hook needs both operands; passing only `self`
            # would make it impossible for the hook to see `other`.
            return self.doc.user_token_hooks['similarity'](self, other)
        # Shortcut: identical verbatim text means identical vectors, so skip
        # the vector maths. Single-element sequences are compared via their
        # first item.
        if hasattr(other, '__len__') and len(other) == 1 and hasattr(other, "__getitem__"):
            if self.c.lex.orth == getattr(other[0], 'orth', None):
                return 1.0
        elif hasattr(other, 'orth'):
            if self.c.lex.orth == other.orth:
                return 1.0
        if self.vocab.vectors.n_keys == 0:
            models_warning(Warnings.W007.format(obj='Token'))
        # A zero norm would cause a division by zero below.
        if self.vector_norm == 0 or other.vector_norm == 0:
            user_warning(Warnings.W008.format(obj='Token'))
            return 0.0
        # Cosine similarity: dot product divided by the product of the norms.
        return (numpy.dot(self.vector, other.vector) /
                (self.vector_norm * other.vector_norm))

    property morph:
        """RETURNS (MorphAnalysis): The analysis obtained from the token's
            `morph` ID via `MorphAnalysis.from_id`.
        """
        def __get__(self):
            return MorphAnalysis.from_id(self.vocab, self.c.morph)

    property lex_id:
        """RETURNS (int): Sequential ID of the token's lexical type."""
        def __get__(self):
            # Read from the lexeme; `rank` returns this same field.
            return self.c.lex.id

    property rank:
        """RETURNS (int): Sequential ID of the token's lexical type, used to
        index into tables, e.g. for word vectors."""
        def __get__(self):
            # Same underlying lexeme field as `lex_id`.
            return self.c.lex.id

    property string:
        """Deprecated: Use Token.text_with_ws instead."""
        def __get__(self):
            # Kept as a plain alias of `text_with_ws`.
            return self.text_with_ws

    property text:
        """RETURNS (unicode): The original verbatim text of the token."""
        def __get__(self):
            # Alias of `orth_`; does not include trailing whitespace.
            return self.orth_

    property text_with_ws:
        """RETURNS (unicode): The text content of the token, followed by a
            single space if the token has trailing whitespace.
        """
        def __get__(self):
            cdef unicode verbatim = self.vocab.strings[self.c.lex.orth]
            # `spacy` flags whether the token is followed by a space.
            if not self.c.spacy:
                return verbatim
            return verbatim + u' '

    property prob:
        """RETURNS (float): Smoothed log probability estimate of token type."""
        def __get__(self):
            # Lexeme-level attribute: read from `self.c.lex`.
            return self.c.lex.prob

    property sentiment:
        """RETURNS (float): A scalar value indicating the positivity or
            negativity of the token."""
        def __get__(self):
            # A user-supplied hook on the doc takes precedence over the value
            # stored on the lexeme.
            if 'sentiment' in self.doc.user_token_hooks:
                return self.doc.user_token_hooks['sentiment'](self)
            return self.c.lex.sentiment

    property lang:
        """RETURNS (uint64): ID of the language of the parent document's
            vocabulary.
        """
        def __get__(self):
            # Lexeme-level attribute; `lang_` resolves this ID to a string.
            return self.c.lex.lang

    property idx:
        """RETURNS (int): The character offset of the token within the parent
            document.
        """
        def __get__(self):
            # Token-level attribute: read directly from the token struct.
            return self.c.idx

    property cluster:
        """RETURNS (int): Brown cluster ID."""
        def __get__(self):
            # Lexeme-level attribute: read from `self.c.lex`.
            return self.c.lex.cluster

    property orth:
        """RETURNS (uint64): ID of the verbatim text content."""
        def __get__(self):
            # Lexeme-level attribute; `orth_` resolves this ID to a string.
            return self.c.lex.orth

    property lower:
        """RETURNS (uint64): ID of the lowercase token text."""
        def __get__(self):
            # Lexeme-level attribute; `lower_` resolves this ID to a string.
            return self.c.lex.lower

    property norm:
        """RETURNS (uint64): ID of the token's norm, i.e. a normalised form of
            the token text. Usually set in the language's tokenizer exceptions
            or norm exceptions.
        """
        def __get__(self):
            # A non-zero token-level norm overrides the lexeme's norm.
            if self.c.norm != 0:
                return self.c.norm
            return self.c.lex.norm

    property shape:
        """RETURNS (uint64): ID of the token's shape, a transform of the
            token's string, to show orthographic features (e.g. "Xxxx", "dd").
        """
        def __get__(self):
            # Lexeme-level attribute; `shape_` resolves this ID to a string.
            return self.c.lex.shape

    property prefix:
        """RETURNS (uint64): ID of a length-N substring from the start of the
            token. Defaults to `N=1`.
        """
        def __get__(self):
            # Lexeme-level attribute; `prefix_` resolves this ID to a string.
            return self.c.lex.prefix

    property suffix:
        """RETURNS (uint64): ID of a length-N substring from the end of the
            token. Defaults to `N=3`.
        """
        def __get__(self):
            # Lexeme-level attribute; `suffix_` resolves this ID to a string.
            return self.c.lex.suffix

    property lemma:
        """RETURNS (uint64): ID of the base form of the word, with no
            inflectional suffixes.
        """
        def __get__(self):
            if self.c.lemma == 0:
                # No lemma stored on the token: fall back to the lemmatizer's
                # lookup on the verbatim text and map the result through the
                # string store.
                lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_)
                return self.vocab.strings[lemma_]
            else:
                return self.c.lemma

        def __set__(self, attr_t lemma):
            # Stores the ID as given; no string-store lookup happens here.
            self.c.lemma = lemma

    property pos:
        """RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
        def __get__(self):
            return self.c.pos
        def __set__(self, pos):
            # Written straight to the token struct; `pos_` accepts the name.
            self.c.pos = pos

    property tag:
        """RETURNS (uint64): ID of fine-grained part-of-speech tag."""
        def __get__(self):
            return self.c.tag

        def __set__(self, attr_t tag):
            # Goes through the morphology's `assign_tag` rather than writing
            # `self.c.tag` directly.
            self.vocab.morphology.assign_tag(self.c, tag)

    property dep:
        """RETURNS (uint64): ID of syntactic dependency label."""
        def __get__(self):
            return self.c.dep

        def __set__(self, attr_t label):
            # Stores the label ID as given; `dep_` accepts the string form.
            self.c.dep = label

    property has_vector:
        """Whether a word vector is available for this token.

        A 'has_vector' user hook on the doc takes precedence. Otherwise, if
        the vocab's vector table is empty but the doc has a non-empty tensor,
        the token counts as having a vector.

        RETURNS (bool): Whether a word vector is associated with the object.
        """
        def __get__(self):
            hooks = self.doc.user_token_hooks
            if 'has_vector' in hooks:
                return hooks['has_vector'](self)
            if self.vocab.vectors.size != 0 or self.doc.tensor.size == 0:
                return self.vocab.has_vector(self.c.lex.orth)
            return True

    property vector:
        """A real-valued meaning representation.

        A 'vector' user hook on the doc takes precedence. Otherwise, if the
        vocab's vector table is empty but the doc has a non-empty tensor, the
        token's row of that tensor is returned.

        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
            representing the token's semantics.
        """
        def __get__(self):
            hooks = self.doc.user_token_hooks
            if 'vector' in hooks:
                return hooks['vector'](self)
            if self.vocab.vectors.size != 0 or self.doc.tensor.size == 0:
                return self.vocab.get_vector(self.c.lex.orth)
            return self.doc.tensor[self.i]

    property vector_norm:
        """The L2 norm of the token's vector representation.

        RETURNS (float): The L2 norm of the vector representation.
        """
        def __get__(self):
            if 'vector_norm' in self.doc.user_token_hooks:
                return self.doc.user_token_hooks['vector_norm'](self)
            # Computed on every access from `self.vector`; nothing is cached.
            vector = self.vector
            return numpy.sqrt((vector ** 2).sum())

    property n_lefts:
        """RETURNS (int): The number of leftward immediate children of the
            word, in the syntactic dependency parse.
        """
        def __get__(self):
            # Stored counter on the token struct; not derived from `lefts`.
            return self.c.l_kids

    property n_rights:
        """RETURNS (int): The number of rightward immediate children of the
            word, in the syntactic dependency parse.
        """
        def __get__(self):
            # Stored counter on the token struct; not derived from `rights`.
            return self.c.r_kids

    property sent:
        """RETURNS (Span): The sentence span that the token is a part of."""
        def __get__(self):
            if 'sent' in self.doc.user_token_hooks:
                return self.doc.user_token_hooks['sent'](self)
            # Build a one-token span over this token and ask it for its
            # sentence.
            return self.doc[self.i : self.i+1].sent

    property sent_start:
        """Legacy accessor for the sentence-start marker.

        The getter returns False for the first token of the doc and the raw
        stored `sent_start` value for every other token. The setter forwards
        to `is_sent_start`.
        """
        def __get__(self):
            # Raising a deprecation warning here causes errors for autocomplete
            # Handle broken backwards compatibility case: doc[0].sent_start
            # was False.
            if self.i == 0:
                return False
            else:
                return self.c.sent_start

        def __set__(self, value):
            self.is_sent_start = value

    property is_sent_start:
        """RETURNS (bool / None): Whether the token starts a sentence.
            None if unknown.
        """
        def __get__(self):
            # Stored encoding: 0 is unknown, negative is False, positive is
            # True.
            marker = self.c.sent_start
            if marker == 0:
                return None
            return marker > 0

        def __set__(self, value):
            # Sentence boundaries cannot be changed once the doc is parsed.
            if self.doc.is_parsed:
                raise ValueError(Errors.E043)
            # Only the exact singletons True, False and None are accepted.
            if value is True:
                self.c.sent_start = 1
            elif value is False:
                self.c.sent_start = -1
            elif value is None:
                self.c.sent_start = 0
            else:
                raise ValueError(Errors.E044.format(value=value))

    property lefts:
        """The leftward immediate children of the word, in the syntactic
        dependency parse.

        YIELDS (Token): A left-child of the token.
        """
        def __get__(self):
            cdef int nr_iter = 0
            # Start at the token's left edge: `l_edge` is a doc index, turned
            # into a pointer relative to this token's own pointer.
            cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
            while ptr < self.c:
                # `head` is a relative offset, so `ptr + ptr.head` is the
                # head's pointer; a match means `ptr` is a direct child.
                if ptr + ptr.head == self.c:
                    # `self.c - self.i` is the start of the token array, so
                    # the difference is the child's index in the doc.
                    yield self.doc[ptr - (self.c - self.i)]
                ptr += 1
                nr_iter += 1
                # This is ugly, but it's a way to guard out infinite loops
                if nr_iter >= 10000000:
                    raise RuntimeError(Errors.E045.format(attr='token.lefts'))

    property rights:
        """The rightward immediate children of the word, in the syntactic
        dependency parse.

        YIELDS (Token): A right-child of the token.
        """
        def __get__(self):
            # Start at the token's right edge and scan leftwards towards the
            # token itself.
            cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
            tokens = []
            cdef int nr_iter = 0
            while ptr > self.c:
                # `ptr + ptr.head` is the head's pointer; a match means `ptr`
                # is a direct child of this token.
                if ptr + ptr.head == self.c:
                    tokens.append(self.doc[ptr - (self.c - self.i)])
                ptr -= 1
                nr_iter += 1
                # Guard against infinite loops, as in `lefts`.
                if nr_iter >= 10000000:
                    raise RuntimeError(Errors.E045.format(attr='token.rights'))
            # Children were collected right-to-left; yield them in doc order.
            tokens.reverse()
            for t in tokens:
                yield t

    property children:
        """The token's immediate syntactic children: all left children
        first, followed by all right children.

        YIELDS (Token): A child token such that child.head==self
        """
        def __get__(self):
            for left_child in self.lefts:
                yield left_child
            for right_child in self.rights:
                yield right_child

    property subtree:
        """A sequence containing the token and all the token's syntactic
        descendants.

        YIELDS (Token): A descendent token such that
            `self.is_ancestor(descendent) or token == self`.
        """
        def __get__(self):
            # Recursive in-order walk: subtrees of the left children, then the
            # token itself, then subtrees of the right children.
            for word in self.lefts:
                yield from word.subtree
            yield self
            for word in self.rights:
                yield from word.subtree

    property left_edge:
        """The leftmost token of this token's syntactic descendents.

        RETURNS (Token): The first token such that `self.is_ancestor(token)`.
        """
        def __get__(self):
            # `l_edge` is a stored doc index, not recomputed from the tree.
            return self.doc[self.c.l_edge]

    property right_edge:
        """The rightmost token of this token's syntactic descendents.

        RETURNS (Token): The last token such that `self.is_ancestor(token)`.
        """
        def __get__(self):
            # `r_edge` is a stored doc index, not recomputed from the tree.
            return self.doc[self.c.r_edge]

    property ancestors:
        """A sequence of this token's syntactic ancestors, starting with the
        direct head and moving upwards.

        YIELDS (Token): A sequence of ancestor tokens such that
            `ancestor.is_ancestor(self)`.
        """
        def __get__(self):
            cdef const TokenC* head_ptr = self.c
            # guard against infinite loop, no token can have
            # more ancestors than tokens in the tree
            cdef int i = 0
            # `head` is a relative offset; 0 means the token is its own head,
            # which ends the walk.
            while head_ptr.head != 0 and i < self.doc.length:
                head_ptr += head_ptr.head
                # Convert the pointer back into a doc index.
                yield self.doc[head_ptr - (self.c - self.i)]
                i += 1

    def is_ancestor(self, descendant):
        """Check whether this token is a parent, grandparent, etc. of another
        in the dependency tree.

        descendant (Token): Another token.
        RETURNS (bool): Whether this token is the ancestor of the descendant.
            Always False if the two tokens belong to different docs.
        """
        if self.doc is not descendant.doc:
            return False
        # Walk up from the descendant and look for this token's index.
        for candidate in descendant.ancestors:
            if candidate.i == self.i:
                return True
        return False

    property head:
        """The syntactic parent, or "governor", of this token.

        Setting the head re-attaches the token and updates the child counters
        (`l_kids`/`r_kids`) and subtree edges (`l_edge`/`r_edge`) of the old
        head, the new head and their ancestors.

        RETURNS (Token): The token predicted by the parser to be the head of
            the current token.
        """
        def __get__(self):
            # `self.c.head` is an offset relative to this token's index.
            return self.doc[self.i + self.c.head]

        def __set__(self, Token new_head):
            # this function sets the head of self to new_head
            # and updates the counters for left/right dependents
            # and left/right corner for the new and the old head
            # NOTE(review): nothing here checks that `new_head` belongs to the
            # same doc as `self` - confirm callers guarantee this.

            # do nothing if old head is new head
            if self.i + self.c.head == new_head.i:
                return

            cdef Token old_head = self.head
            cdef int rel_newhead_i = new_head.i - self.i

            # is the new head a descendant of the old head
            cdef bint is_desc = old_head.is_ancestor(new_head)

            cdef int new_edge
            cdef Token anc, child

            # update number of deps of old head
            if self.c.head > 0:  # left dependent
                old_head.c.l_kids -= 1
                if self.c.l_edge == old_head.c.l_edge:
                    # the token dominates the left edge so the left edge of
                    # the head may change when the token is reattached, it may
                    # not change if the new head is a descendant of the current
                    # head

                    new_edge = self.c.l_edge
                    # the new l_edge is the left-most l_edge on any of the
                    # other dependents where the l_edge is left of the head,
                    # otherwise it is the head
                    if not is_desc:
                        new_edge = old_head.i
                        for child in old_head.children:
                            if child == self:
                                continue
                            if child.c.l_edge < new_edge:
                                new_edge = child.c.l_edge
                        old_head.c.l_edge = new_edge

                    # walk up the tree from old_head and assign new l_edge to
                    # ancestors until an ancestor already has an l_edge that's
                    # further left
                    for anc in old_head.ancestors:
                        if anc.c.l_edge <= new_edge:
                            break
                        anc.c.l_edge = new_edge

            elif self.c.head < 0:  # right dependent
                old_head.c.r_kids -= 1
                # do the same thing as for l_edge
                if self.c.r_edge == old_head.c.r_edge:
                    new_edge = self.c.r_edge

                    if not is_desc:
                        new_edge = old_head.i
                        for child in old_head.children:
                            if child == self:
                                continue
                            if child.c.r_edge > new_edge:
                                new_edge = child.c.r_edge
                        old_head.c.r_edge = new_edge

                    for anc in old_head.ancestors:
                        if anc.c.r_edge >= new_edge:
                            break
                        anc.c.r_edge = new_edge

            # update number of deps of new head
            if rel_newhead_i > 0:  # left dependent
                new_head.c.l_kids += 1
                # walk up the tree from new head and set l_edge to self.l_edge
                # until you hit a token with an l_edge further to the left
                if self.c.l_edge < new_head.c.l_edge:
                    new_head.c.l_edge = self.c.l_edge
                    for anc in new_head.ancestors:
                        if anc.c.l_edge <= self.c.l_edge:
                            break
                        anc.c.l_edge = self.c.l_edge

            elif rel_newhead_i < 0:  # right dependent
                new_head.c.r_kids += 1
                # do the same as for l_edge
                if self.c.r_edge > new_head.c.r_edge:
                    new_head.c.r_edge = self.c.r_edge
                    for anc in new_head.ancestors:
                        if anc.c.r_edge >= self.c.r_edge:
                            break
                        anc.c.r_edge = self.c.r_edge

            # set new head
            # (done last: the bookkeeping above reads the old `self.c.head`)
            self.c.head = rel_newhead_i

    property conjuncts:
        """A sequence of coordinated tokens. The token itself is not part of
        the sequence.

        YIELDS (Token): A coordinated token.
        """
        def __get__(self):
            """Yield the right children labelled 'conj', or whatever the
            'conjuncts' user hook yields if one is set."""
            cdef Token word
            if 'conjuncts' in self.doc.user_token_hooks:
                yield from self.doc.user_token_hooks['conjuncts'](self)
            else:
                # A token that is itself labelled 'conj' yields nothing.
                if self.dep_ != 'conj':
                    for word in self.rights:
                        if word.dep_ == 'conj':
                            yield word
                            # NOTE(review): `word` is labelled 'conj' here, so
                            # this recursive call hits the guard above and
                            # contributes no further tokens - confirm whether
                            # chained conjuncts were meant to be followed.
                            yield from word.conjuncts

    property ent_type:
        """RETURNS (uint64): Named entity type."""
        def __get__(self):
            return self.c.ent_type

        def __set__(self, ent_type):
            # Expects the ID form; `ent_type_` accepts the string form.
            self.c.ent_type = ent_type

    property ent_iob:
        """IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
        is assigned.

        RETURNS (uint64): IOB code of named entity tag.
        """
        def __get__(self):
            # Read-only: this property defines no setter.
            return self.c.ent_iob

    property ent_type_:
        """RETURNS (unicode): Named entity type."""
        def __get__(self):
            return self.vocab.strings[self.c.ent_type]

        def __set__(self, ent_type):
            # Add the string to the string store and keep the returned ID.
            self.c.ent_type = self.vocab.strings.add(ent_type)

    property ent_iob_:
        """IOB code of named entity tag. "B" means the token begins an entity,
        "I" means it is inside an entity, "O" means it is outside an entity,
        and "" means no entity tag is set.

        RETURNS (unicode): IOB code of named entity tag.
        """
        def __get__(self):
            # Indexed by the numeric code: 0 is '', 1 is 'I', 2 is 'O',
            # 3 is 'B'.
            iob_strings = ('', 'I', 'O', 'B')
            return iob_strings[self.c.ent_iob]

    property ent_id:
        """RETURNS (uint64): ID of the entity the token is an instance of,
            if any.
        """
        def __get__(self):
            return self.c.ent_id

        def __set__(self, hash_t key):
            # Expects the hash form; `ent_id_` accepts the string form.
            self.c.ent_id = key

    property ent_id_:
        """RETURNS (unicode): ID of the entity the token is an instance of,
            if any.
        """
        def __get__(self):
            return self.vocab.strings[self.c.ent_id]

        def __set__(self, name):
            # Add the string to the string store and keep the returned ID.
            self.c.ent_id = self.vocab.strings.add(name)

    property whitespace_:
        """RETURNS (unicode): A single space if the token is followed by
            whitespace, otherwise the empty string.
        """
        def __get__(self):
            if self.c.spacy:
                return ' '
            return ''

    property orth_:
        """RETURNS (unicode): Verbatim text content (identical to
            `Token.text`). Exists mostly for consistency with the other
            attributes.
        """
        def __get__(self):
            # Resolve the lexeme's `orth` ID through the string store.
            return self.vocab.strings[self.c.lex.orth]

    property lower_:
        """RETURNS (unicode): The lowercase token text. Equivalent to
            `Token.text.lower()`.
        """
        def __get__(self):
            # Resolve the lexeme's `lower` ID through the string store.
            return self.vocab.strings[self.c.lex.lower]

    property norm_:
        """RETURNS (unicode): The token's norm, i.e. a normalised form of the
            token text. Usually set in the language's tokenizer exceptions or
            norm exceptions.
        """
        def __get__(self):
            # Goes through the `norm` property, so the lexeme's norm is used
            # when no token-level norm is set.
            return self.vocab.strings[self.norm]

        def __set__(self, unicode norm_):
            # Sets the token-level norm only; the lexeme is left untouched.
            self.c.norm = self.vocab.strings.add(norm_)

    property shape_:
        """RETURNS (unicode): Transform of the token's string, to show
            orthographic features. For example, "Xxxx" or "dd".
        """
        def __get__(self):
            # Resolve the lexeme's `shape` ID through the string store.
            return self.vocab.strings[self.c.lex.shape]

    property prefix_:
        """RETURNS (unicode): A length-N substring from the start of the token.
            Defaults to `N=1`.
        """
        def __get__(self):
            # Resolve the lexeme's `prefix` ID through the string store.
            return self.vocab.strings[self.c.lex.prefix]

    property suffix_:
        """RETURNS (unicode): A length-N substring from the end of the token.
            Defaults to `N=3`.
        """
        def __get__(self):
            # Resolve the lexeme's `suffix` ID through the string store.
            return self.vocab.strings[self.c.lex.suffix]

    property lang_:
        """RETURNS (unicode): Language of the parent document's vocabulary,
            e.g. 'en'.
        """
        def __get__(self):
            # Resolve the lexeme's `lang` ID through the string store.
            return self.vocab.strings[self.c.lex.lang]

    property lemma_:
        """RETURNS (unicode): The token lemma, i.e. the base form of the word,
            with no inflectional suffixes.
        """
        def __get__(self):
            # A stored lemma wins; otherwise fall back to the lemmatizer's
            # lookup on the verbatim text.
            if self.c.lemma != 0:
                return self.vocab.strings[self.c.lemma]
            return self.vocab.morphology.lemmatizer.lookup(self.orth_)

        def __set__(self, unicode lemma_):
            self.c.lemma = self.vocab.strings.add(lemma_)

    property pos_:
        """RETURNS (unicode): Coarse-grained part-of-speech tag."""
        def __get__(self):
            # Names come from the `parts_of_speech` tables, not from the
            # string store.
            return parts_of_speech.NAMES[self.c.pos]
        def __set__(self, pos_name):
            # A name missing from `parts_of_speech.IDS` fails on this lookup.
            self.c.pos = parts_of_speech.IDS[pos_name]

    property tag_:
        """RETURNS (unicode): Fine-grained part-of-speech tag."""
        def __get__(self):
            return self.vocab.strings[self.c.tag]

        def __set__(self, tag):
            # Routed through the `tag` property so its setter logic
            # (`assign_tag` on the morphology) is applied.
            self.tag = self.vocab.strings.add(tag)

    property dep_:
        """RETURNS (unicode): The syntactic dependency label."""
        def __get__(self):
            return self.vocab.strings[self.c.dep]

        def __set__(self, unicode label):
            # Add the label to the string store and keep the returned ID.
            self.c.dep = self.vocab.strings.add(label)

    property is_oov:
        """RETURNS (bool): Whether the token is out-of-vocabulary."""
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, IS_OOV)

    property is_stop:
        """RETURNS (bool): Whether the token is a stop word, i.e. part of a
            "stop list" defined by the language data.
        """
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, IS_STOP)

    property is_alpha:
        """RETURNS (bool): Whether the token consists of alpha characters.
            Equivalent to `token.text.isalpha()`.
        """
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)

    property is_ascii:
        """RETURNS (bool): Whether the token consists of ASCII characters.
            Equivalent to `all(ord(c) < 128 for c in token.text)`.
        """
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, IS_ASCII)

    property is_digit:
        """RETURNS (bool): Whether the token consists of digits. Equivalent to
            `token.text.isdigit()`.
        """
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)

    property is_lower:
        """RETURNS (bool): Whether the token is in lowercase. Equivalent to
            `token.text.islower()`.
        """
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, IS_LOWER)

    property is_upper:
        """RETURNS (bool): Whether the token is in uppercase. Equivalent to
            `token.text.isupper()`.
        """
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, IS_UPPER)

    property is_title:
        """RETURNS (bool): Whether the token is in titlecase. Equivalent to
            `token.text.istitle()`.
        """
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, IS_TITLE)

    property is_punct:
        """RETURNS (bool): Whether the token is punctuation."""
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)

    property is_space:
        """RETURNS (bool): Whether the token consists of whitespace characters.
            Equivalent to `token.text.isspace()`.
        """
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, IS_SPACE)

    property is_bracket:
        """RETURNS (bool): Whether the token is a bracket."""
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)

    property is_quote:
        """RETURNS (bool): Whether the token is a quotation mark."""
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)

    property is_left_punct:
        """RETURNS (bool): Whether the token is a left punctuation mark."""
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)

    property is_right_punct:
        """RETURNS (bool): Whether the token is a right punctuation mark."""
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)

    property is_currency:
        """RETURNS (bool): Whether the token is a currency symbol."""
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, IS_CURRENCY)

    property like_url:
        """RETURNS (bool): Whether the token resembles a URL."""
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, LIKE_URL)

    property like_num:
        """RETURNS (bool): Whether the token resembles a number, e.g. "10.9",
            "10", "ten", etc.
        """
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)

    property like_email:
        """RETURNS (bool): Whether the token resembles an email address."""
        def __get__(self):
            # Same lexeme flag lookup as `check_flag`, with a fixed flag ID.
            return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
-												Fix issue #672: ent_iob_ was a string, not unicode, due to missing unicode_literals statement.

											
										
										
											2016-12-19 00:33:53 +03:00
+								# cython: infer_types=True
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								# coding: utf8
-												Fix issue #672: ent_iob_ was a string, not unicode, due to missing unicode_literals statement.

											
										
										
											2016-12-19 00:33:53 +03:00
+								from __future__ import unicode_literals
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								from libc.string cimport memcpy
 								from cpython.mem cimport PyMem_Malloc, PyMem_Free
 								# Compiler crashes on memory view coercion without this. Should report bug.
 								from cython.view cimport array as cvarray
-												* Break up tokens.pyx into tokens/doc.pyx, tokens/token.pyx, tokens/spans.pyx

											
										
										
											2015-07-13 21:20:58 +03:00
+								cimport numpy as np
 								np.import_array()
 								import numpy
-												Import hash_t typedef in token.pyx

											
										
										
											2016-09-23 15:22:06 +03:00
+								from ..typedefs cimport hash_t
-												* Begin merge of Gazetteer and DE branches

											
										
										
											2015-09-06 20:45:15 +03:00
+								from ..lexeme cimport Lexeme
-												* Rename ATTR_IDS to attrs.IDS. Rename ATTR_NAMES to attrs.NAMES. Rename UNIV_POS_IDS to parts_of_speech.IDS

											
										
										
											2015-10-10 09:55:55 +03:00
+								from .. import parts_of_speech
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
-												added new lex feat to token

											
										
										
											2018-02-11 20:55:48 +03:00
+								from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
 								from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								from ..compat import is_config
-												💫 Add .similarity warnings for no vectors and option to exclude warnings (#2197)

* Add logic to filter out warning IDs via environment variable

Usage: SPACY_WARNING_EXCLUDE=W001,W007

* Add warnings for empty vectors

* Add warning if no word vectors are used in .similarity methods

For example, if only tensors are available in small models – should hopefully clear up some confusion around this

* Capture warnings in tests

* Rename SPACY_WARNING_EXCLUDE to SPACY_WARNING_IGNORE

											
										
										
											2018-05-21 02:22:38 +03:00
+								from ..errors import Errors, Warnings, user_warning, models_warning
-												Update deprecated methods and add warnings

											
										
										
											2017-11-01 18:49:42 +03:00
+								from .. import util
-												Don't raise error if set_extension has getter and setter (closes #2177)

Improve error messages, raise error if setter is specified without a getter and compare against _unset to allow default=None. Also add more tests.

											
										
										
											2018-04-03 19:30:17 +03:00
+								from .underscore import Underscore, get_ext_args
-												Remove enums from morphology

											
										
										
											2019-03-07 19:14:57 +03:00
+								from .morphanalysis cimport MorphAnalysis
-												* Work on language-independent refactoring

											
										
										
											2015-08-23 21:49:18 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								cdef class Token:
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								    """An individual token – i.e. a word, punctuation symbol, whitespace,
 								    etc."""
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
+								    @classmethod
-												Don't raise error if set_extension has getter and setter (closes #2177)

Improve error messages, raise error if setter is specified without a getter and compare against _unset to allow default=None. Also add more tests.

											
										
										
											2018-04-03 19:30:17 +03:00
+								    def set_extension(cls, name, **kwargs):
 								        if cls.has_extension(name) and not kwargs.get('force', False):
 								            raise ValueError(Errors.E090.format(name=name, obj='Token'))
 								        Underscore.token_extensions[name] = get_ext_args(**kwargs)
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
 								    @classmethod
 								    def get_extension(cls, name):
-												Fix Token.set_extension

											
										
										
											2018-04-29 16:48:19 +03:00
+								        return Underscore.token_extensions.get(name)
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
 								    @classmethod
 								    def has_extension(cls, name):
-												Fix Token.set_extension

											
										
										
											2018-04-29 16:48:19 +03:00
+								        return name in Underscore.token_extensions
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
-												Add remove_extension method on Doc, Token and Span (closes #2242)

											
										
										
											2018-04-29 00:33:09 +03:00
+								    @classmethod
 								    def remove_extension(cls, name):
 								        if not cls.has_extension(name):
 								            raise ValueError(Errors.E046.format(name=name))
 								        return Underscore.token_extensions.pop(name)
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								    def __cinit__(self, Vocab vocab, Doc doc, int offset):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Construct a `Token` object.
 								        vocab (Vocab): A storage container for lexical types.
 								        doc (Doc): The parent document.
 								        offset (int): The index of the token within the document.
 								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        self.vocab = vocab
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								        self.doc = doc
-												* Rename Doc.data to Doc.c

											
										
										
											2015-11-03 16:15:14 +03:00
+								        self.c = &self.doc.c[offset]
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								        self.i = offset
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Make Token hashable. Fixes #743

											
										
										
											2017-01-16 15:27:57 +03:00
+								    def __hash__(self):
 								        return hash((self.doc, self.i))
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    def __len__(self):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """The number of unicode characters in the token, i.e. `token.text`.
 								        RETURNS (int): The number of unicode characters in the token.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        return self.c.lex.length
 								    def __unicode__(self):
-												Remove deprecation shim around str/bytes in Token.

											
										
										
											2016-10-17 15:02:47 +03:00
+								        return self.text
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												fixed error when printing unicode

											
										
										
											2015-11-02 21:22:18 +03:00
+								    def __bytes__(self):
-												Remove deprecation shim around str/bytes in Token.

											
										
										
											2016-10-17 15:02:47 +03:00
+								        return self.text.encode('utf8')
-												fixed error when printing unicode

											
										
										
											2015-11-02 21:22:18 +03:00
-												* Fix string coercion for Python 3

											
										
										
											2015-07-24 04:49:30 +03:00
+								    def __str__(self):
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        if is_config(python3=True):
-												fixed error when printing unicode

											
										
										
											2015-11-02 21:22:18 +03:00
+								            return self.__unicode__()
 								        return self.__bytes__()
-												* Fix string coercion for Python 3

											
										
										
											2015-07-24 04:49:30 +03:00
-												added __repr__ that prints text in ipython for doc, token, and span objects

											
										
										
											2015-10-21 14:11:46 +03:00
+								    def __repr__(self):
-												fixed error when printing unicode

											
										
										
											2015-11-02 21:22:18 +03:00
+								        return self.__str__()
-												added __repr__ that prints text in ipython for doc, token, and span objects

											
										
										
											2015-10-21 14:11:46 +03:00
-												Amend 8ae8b443f: Handle comparison with None tokens.

											
										
										
											2017-01-11 15:03:32 +03:00
+								    def __richcmp__(self, Token other, int op):
-												Add richcmp method to Token. Closes #631

											
										
										
											2017-01-09 21:30:31 +03:00
+								        # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
-												Fix rich comparison against None objects. Closes #1757

											
										
										
											2018-01-15 17:51:25 +03:00
+								        if other is None:
 								            if op in (0, 1, 2):
 								                return False
 								            else:
 								                return True
-												Fix comparison of Token from different docs. Closes #1257

											
										
										
											2017-08-19 17:39:32 +03:00
+								        cdef Doc my_doc = self.doc
 								        cdef Doc other_doc = other.doc
-												Add richcmp method to Token. Closes #631

											
										
										
											2017-01-09 21:30:31 +03:00
+								        my = self.idx
-												Fix rich comparison against None objects. Closes #1757

											
										
										
											2018-01-15 17:51:25 +03:00
+								        their = other.idx
-												Add richcmp method to Token. Closes #631

											
										
										
											2017-01-09 21:30:31 +03:00
+								        if op == 0:
 								            return my < their
 								        elif op == 2:
-												Fix comparison of Token from different docs. Closes #1257

											
										
										
											2017-08-19 17:39:32 +03:00
+								            if my_doc is other_doc:
 								                return my == their
 								            else:
 								                return False
-												Add richcmp method to Token. Closes #631

											
										
										
											2017-01-09 21:30:31 +03:00
+								        elif op == 4:
 								            return my > their
 								        elif op == 1:
 								            return my <= their
 								        elif op == 3:
-												Fix comparison of Token from different docs. Closes #1257

											
										
										
											2017-08-19 17:39:32 +03:00
+								            if my_doc is other_doc:
 								                return my != their
 								            else:
 								                return True
-												Add richcmp method to Token. Closes #631

											
										
										
											2017-01-09 21:30:31 +03:00
+								        elif op == 5:
 								            return my >= their
 								        else:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								            raise ValueError(Errors.E041.format(op=op))
-												Add richcmp method to Token. Closes #631

											
										
										
											2017-01-09 21:30:31 +03:00
-												Raise better error if token is pickled (resolves #2833) (#3267)


											
										
										
											2019-02-13 13:27:04 +03:00
+								    def __reduce__(self):
 								        raise NotImplementedError(Errors.E111)
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
+								    @property
 								    def _(self):
 								        return Underscore(Underscore.token_extensions, self,
 								                          start=self.idx, end=None)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Check the value of a boolean flag.
 								        flag_id (int): The ID of the flag attribute.
 								        RETURNS (bool): Whether the flag is set.
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        EXAMPLE:
 								            >>> from spacy.attrs import IS_TITLE
 								            >>> doc = nlp(u'Give it back! He pleaded.')
 								            >>> token = doc[0]
 								            >>> token.check_flag(IS_TITLE)
 								            True
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, flag_id)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    def nbor(self, int i=1):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Get a neighboring token.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        i (int): The relative position of the token to get. Defaults to 1.
 								        RETURNS (Token): The token at position `self.doc[self.i+i]`.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Fix #1375 -- out-of-bounds on token.nbor()

											
										
										
											2017-10-24 13:10:39 +03:00
+								        if self.i+i < 0 or (self.i+i >= len(self.doc)):
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								            raise IndexError(Errors.E042.format(i=self.i, j=i, length=len(self.doc)))
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								        return self.doc[self.i+i]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								    def similarity(self, other):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Make a semantic similarity estimate. The default estimate is cosine
 								        similarity using an average of word vectors.
 								        other (object): The object to compare with. By default, accepts `Doc`,
 								            `Span`, `Token` and `Lexeme` objects.
 								        RETURNS (float): A scalar similarity score. Higher is more similar.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.

											
										
										
											2016-10-19 21:54:03 +03:00
+								        if 'similarity' in self.doc.user_token_hooks:
-												Try using tensor for vector/similarity methdos

											
										
										
											2017-05-31 00:35:17 +03:00
+								            return self.doc.user_token_hooks['similarity'](self)
-												Test and fix for Issue #2219 (#2272)

Test and fix for Issue #2219: Token.similarity() failed if single letter
											
										
										
											2018-05-03 19:40:46 +03:00
+								        if hasattr(other, '__len__') and len(other) == 1 and hasattr(other, "__getitem__"):
-												Make .similarity() return 1.0 if all orth attrs match

											
										
										
											2018-01-15 18:29:48 +03:00
+								            if self.c.lex.orth == getattr(other[0], 'orth', None):
 								                return 1.0
 								        elif hasattr(other, 'orth'):
 								            if self.c.lex.orth == other.orth:
 								                return 1.0
-												💫 Add .similarity warnings for no vectors and option to exclude warnings (#2197)

* Add logic to filter out warning IDs via environment variable

Usage: SPACY_WARNING_EXCLUDE=W001,W007

* Add warnings for empty vectors

* Add warning if no word vectors are used in .similarity methods

For example, if only tensors are available in small models – should hopefully clear up some confusion around this

* Capture warnings in tests

* Rename SPACY_WARNING_EXCLUDE to SPACY_WARNING_IGNORE

											
										
										
											2018-05-21 02:22:38 +03:00
+								        if self.vocab.vectors.n_keys == 0:
 								            models_warning(Warnings.W007.format(obj='Token'))
-												* Fix vectors bugs for OOV words

											
										
										
											2015-09-22 03:10:01 +03:00
+								        if self.vector_norm == 0 or other.vector_norm == 0:
-												💫 Add .similarity warnings for no vectors and option to exclude warnings (#2197)

* Add logic to filter out warning IDs via environment variable

Usage: SPACY_WARNING_EXCLUDE=W001,W007

* Add warnings for empty vectors

* Add warning if no word vectors are used in .similarity methods

For example, if only tensors are available in small models – should hopefully clear up some confusion around this

* Capture warnings in tests

* Rename SPACY_WARNING_EXCLUDE to SPACY_WARNING_IGNORE

											
										
										
											2018-05-21 02:22:38 +03:00
+								            user_warning(Warnings.W008.format(obj='Token'))
-												* Fix vectors bugs for OOV words

											
										
										
											2015-09-22 03:10:01 +03:00
+								            return 0.0
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        return (numpy.dot(self.vector, other.vector) /
 								                (self.vector_norm * other.vector_norm))
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
-												Remove enums from morphology

											
										
										
											2019-03-07 19:14:57 +03:00
+								    property morph:
 								        def __get__(self):
 								            return MorphAnalysis.from_id(self.vocab, self.c.morph)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property lex_id:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (int): Sequential ID of the token's lexical type."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.lex.id
-												* Add .rank property to Token and Lexeme, for frequency rank

											
										
										
											2015-11-08 18:18:25 +03:00
+								    property rank:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (int): Sequential ID of the token's lexical type, used to
 								        index into tables, e.g. for word vectors."""
-												* Add .rank property to Token and Lexeme, for frequency rank

											
										
										
											2015-11-08 18:18:25 +03:00
+								        def __get__(self):
 								            return self.c.lex.id
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property string:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """Deprecated: Use Token.text_with_ws instead."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												* Route token.string via token.txt_with_ws, to deprecate token.string in future

											
										
										
											2016-01-16 19:14:34 +03:00
+								            return self.text_with_ws
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												* Add test and test_with_ws attributes.

											
										
										
											2015-09-13 03:27:42 +03:00
+								    property text:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): The original verbatim text of the token."""
-												* Add test and test_with_ws attributes.

											
										
										
											2015-09-13 03:27:42 +03:00
+								        def __get__(self):
 								            return self.orth_
 								    property text_with_ws:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): The text content of the span (with trailing
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								            whitespace).
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """
-												* Add test and test_with_ws attributes.

											
										
										
											2015-09-13 03:27:42 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            cdef unicode orth = self.vocab.strings[self.c.lex.orth]
-												* Add test and test_with_ws attributes.

											
										
										
											2015-09-13 03:27:42 +03:00
+								            if self.c.spacy:
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								                return orth + u' '
-												* Add test and test_with_ws attributes.

											
										
										
											2015-09-13 03:27:42 +03:00
+								            else:
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								                return orth
-												* Add test and test_with_ws attributes.

											
										
										
											2015-09-13 03:27:42 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property prob:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (float): Smoothed log probability estimate of token type."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.lex.prob
-												Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.

											
										
										
											2016-10-19 21:54:03 +03:00
+								    property sentiment:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (float): A scalar value indicating the positivity or
 								            negativity of the token."""
-												Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.

											
										
										
											2016-10-19 21:54:03 +03:00
+								        def __get__(self):
 								            if 'sentiment' in self.doc.user_token_hooks:
 								                return self.doc.user_token_hooks['sentiment'](self)
 								            return self.c.lex.sentiment
-												introduce lang field for LexemeC to hold language id
put noun_chunk logic into iterators.py for each language separately

											
										
										
											2016-03-10 15:01:34 +03:00
+								    property lang:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of the language of the parent document's
 								            vocabulary.
 								        """
-												introduce lang field for LexemeC to hold language id
put noun_chunk logic into iterators.py for each language separately

											
										
										
											2016-03-10 15:01:34 +03:00
+								        def __get__(self):
 								            return self.c.lex.lang
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property idx:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (int): The character offset of the token within the parent
 								            document.
 								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.idx
 								    property cluster:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (int): Brown cluster ID."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.lex.cluster
 								    property orth:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of the verbatim text content."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.lex.orth
 								    property lower:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of the lowercase token text."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.lex.lower
 								    property norm:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of the token's norm, i.e. a normalised form of
 								            the token text. Usually set in the language's tokenizer exceptions
 								            or norm exceptions.
 								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Make NORM a token attribute (#3029)

See #3028. The solution in this patch is pretty debateable.

What we do is give the TokenC struct a .norm field, by repurposing the previously idle .sense attribute. It's nice to repurpose a previous field because it means the TokenC doesn't change size, so even if someone's using the internals very deeply, nothing will break.

The weird thing here is that the TokenC and the LexemeC both have an attribute named NORM. This arguably assists in backwards compatibility. On the other hand, maybe it's really bad! We're changing the semantics of the attribute subtly, so maybe it's better if someone calling lex.norm gets a breakage, and instead is told to write lex.default_norm?

Overall I believe this patch makes the NORM feature work the way we sort of expected it to work. Certainly it's much more like how the docs describe it, and more in line with how we've been directing people to use the norm attribute. We'll also be able to use token.norm to do stuff like spelling correction, which is pretty cool.

											
										
										
											2018-12-08 12:49:10 +03:00
+								            if self.c.norm == 0:
 								                return self.c.lex.norm
 								            else:
 								                return self.c.norm
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property shape:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of the token's shape, a transform of the
 								            tokens's string, to show orthographic features (e.g. "Xxxx", "dd").
 								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.lex.shape
 								    property prefix:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of a length-N substring from the start of the
 								            token. Defaults to `N=1`.
 								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.lex.prefix
 								    property suffix:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of a length-N substring from the end of the
 								            token. Defaults to `N=3`.
 								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.lex.suffix
 								    property lemma:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of the base form of the word, with no
 								            inflectional suffixes.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Use lookup lemmatizer if lemma unset

											
										
										
											2017-11-18 05:33:31 +03:00
+								            if self.c.lemma == 0:
-												lemma property to return hash instead of unicode

											
										
										
											2018-02-27 21:50:01 +03:00
+								                lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_)
 								                return self.vocab.strings[lemma_]
-												Use lookup lemmatizer if lemma unset

											
										
										
											2017-11-18 05:33:31 +03:00
+								            else:
 								                return self.c.lemma
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        def __set__(self, attr_t lemma):
-												Allow lemma to be set from Python. Re #973

											
										
										
											2017-04-16 19:07:53 +03:00
+								            self.c.lemma = lemma
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property pos:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.pos
-												Fix #2014: token.pos_ not writeable

											
										
										
											2018-03-27 22:21:11 +03:00
+								        def __set__(self, pos):
 								            self.c.pos = pos
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property tag:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of fine-grained part-of-speech tag."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.tag
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        def __set__(self, attr_t tag):
-												Fix #595: Lemmatization was incorrect for base forms, because morphological analyser wasn't adding morphology properly.

											
										
										
											2016-11-04 02:29:07 +03:00
+								            self.vocab.morphology.assign_tag(self.c, tag)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property dep:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of syntactic dependency label."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.dep
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        def __set__(self, attr_t label):
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								            self.c.dep = label
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												* Add has_vector attribute to Token and Lexeme

											
										
										
											2015-09-21 12:52:43 +03:00
+								    property has_vector:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """A boolean value indicating whether a word vector is associated with
 								        the object.
 								        RETURNS (bool): Whether a word vector is associated with the object.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Add has_vector attribute to Token and Lexeme

											
										
										
											2015-09-21 12:52:43 +03:00
+								        def __get__(self):
-												Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.

											
										
										
											2016-10-19 21:54:03 +03:00
+								            if 'has_vector' in self.doc.user_token_hooks:
 								                return self.doc.user_token_hooks['has_vector'](self)
-												Back-off to tensor for similarity if no vectors

											
										
										
											2017-11-03 22:56:33 +03:00
+								            if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
 								                return True
-												Fix vector linkage for token

											
										
										
											2017-06-04 22:19:58 +03:00
+								            return self.vocab.has_vector(self.c.lex.orth)
-												* Add has_vector attribute to Token and Lexeme

											
										
										
											2015-09-21 12:52:43 +03:00
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								    property vector:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """A real-valued meaning representation.
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
 								            representing the token's semantics.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.

											
										
										
											2016-10-19 21:54:03 +03:00
+								            if 'vector' in self.doc.user_token_hooks:
 								                return self.doc.user_token_hooks['vector'](self)
-												Back-off to tensor for similarity if no vectors

											
										
										
											2017-11-03 22:56:33 +03:00
+								            if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
 								                return self.doc.tensor[self.i]
 								            else:
 								                return self.vocab.get_vector(self.c.lex.orth)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								    property vector_norm:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-20 16:13:33 +03:00
+								        """The L2 norm of the token's vector representation.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
 								        RETURNS (float): The L2 norm of the vector representation.
 								        """
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								        def __get__(self):
-												Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.

											
										
										
											2016-10-19 21:54:03 +03:00
+								            if 'vector_norm' in self.doc.user_token_hooks:
 								                return self.doc.user_token_hooks['vector_norm'](self)
-												Fix typo

											
										
										
											2017-10-10 05:15:14 +03:00
+								            vector = self.vector
-												Re-delegate vectors to vocab

											
										
										
											2017-05-28 12:46:10 +03:00
+								            return numpy.sqrt((vector ** 2).sum())
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property n_lefts:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (int): The number of leftward immediate children of the
 								            word, in the syntactic dependency parse.
 								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												revert init_model.py back to pre-german state (because it makes more sense)
simplify token.n_rights and token.n_lefts

											
										
										
											2016-03-21 18:10:25 +03:00
+								            return self.c.l_kids
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property n_rights:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (int): The number of rightward immediate children of the
 								            word, in the syntactic dependency parse.
 								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												revert init_model.py back to pre-german state (because it makes more sense)
simplify token.n_rights and token.n_lefts

											
										
										
											2016-03-21 18:10:25 +03:00
+								            return self.c.r_kids
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Add sent property to token (#2521)

* Add sent property to token

* Refactored and cleaned up copy paste errors.

											
										
										
											2018-07-06 16:54:15 +03:00
+								    property sent:
 								        """RETURNS (Span): The sentence span that the token is a part of."""
 								        def __get__(self):
 								            if 'sent' in self.doc.user_token_hooks:
 								                return self.doc.user_token_hooks['sent'](self)
 								            return self.doc[self.i : self.i+1].sent
-												* Add Token.sent_start property, re Issue #235

											
										
										
											2016-05-05 12:53:20 +03:00
+								    property sent_start:
 								        def __get__(self):
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								            # Raising a deprecation warning here causes errors for autocomplete
-												Add Token.is_sent_start property, so can deprecate Token.sent_start

											
										
										
											2017-11-01 15:27:14 +03:00
+								            # Handle broken backwards compatibility case: doc[0].sent_start
 								            # was False.
 								            if self.i == 0:
 								                return False
 								            else:
-												Fix infinite recursion in token.sent_start. Closes #1640

											
										
										
											2018-01-14 17:02:15 +03:00
+								                return self.c.sent_start
-												Add Token.is_sent_start property, so can deprecate Token.sent_start

											
										
										
											2017-11-01 15:27:14 +03:00
 								        def __set__(self, value):
 								            self.is_sent_start = value
 								    property is_sent_start:
 								        """RETURNS (bool / None): Whether the token starts a sentence.
 								            None if unknown.
 								        """
 								        def __get__(self):
 								            if self.c.sent_start == 0:
 								                return None
 								            elif self.c.sent_start < 0:
 								                return False
 								            else:
 								                return True
-												* Add Token.sent_start property, re Issue #235

											
										
										
											2016-05-05 12:53:20 +03:00
-												Add ternary value setting to Token.sent_start

											
										
										
											2017-10-09 00:51:58 +03:00
+								        def __set__(self, value):
-												* Add Token.sent_start property, re Issue #235

											
										
										
											2016-05-05 12:53:20 +03:00
+								            if self.doc.is_parsed:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                raise ValueError(Errors.E043)
-												Add ternary value setting to Token.sent_start

											
										
										
											2017-10-09 00:51:58 +03:00
+								            if value is None:
 								                self.c.sent_start = 0
 								            elif value is True:
 								                self.c.sent_start = 1
 								            elif value is False:
 								                self.c.sent_start = -1
 								            else:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                raise ValueError(Errors.E044.format(value=value))
-												* Add Token.sent_start property, re Issue #235

											
										
										
											2016-05-05 12:53:20 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property lefts:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """The leftward immediate children of the word, in the syntactic
 								        dependency parse.
 								        YIELDS (Token): A left-child of the token.
 								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												* Add loop guard to Token.lefts and Token.rights properties

											
										
										
											2016-01-16 18:18:17 +03:00
+								            cdef int nr_iter = 0
-												* Fix bug in token subtree, introduced by duplication of L/R code in Stateclass. Need to consolidate the two methods.

											
										
										
											2015-09-06 11:48:36 +03:00
+								            cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								            while ptr < self.c:
-												integrated pseudo-projective parsing into parser

- nonproj.pyx holds a class PseudoProjectivity which currently holds
  all functionality to implement Nivre & Nilsson 2005's pseudo-projective
  parsing using the HEAD decoration scheme
- changed lefts/rights in Token to account for possible non-projective
  structures

											
										
										
											2016-03-01 12:09:08 +03:00
+								                if ptr + ptr.head == self.c:
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								                    yield self.doc[ptr - (self.c - self.i)]
-												integrated pseudo-projective parsing into parser

- nonproj.pyx holds a class PseudoProjectivity which currently holds
  all functionality to implement Nivre & Nilsson 2005's pseudo-projective
  parsing using the HEAD decoration scheme
- changed lefts/rights in Token to account for possible non-projective
  structures

											
										
										
											2016-03-01 12:09:08 +03:00
+								                ptr += 1
-												* Add loop guard to Token.lefts and Token.rights properties

											
										
										
											2016-01-16 18:18:17 +03:00
+								                nr_iter += 1
 								                # This is ugly, but it's a way to guard out infinite loops
 								                if nr_iter >= 10000000:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                    raise RuntimeError(Errors.E045.format(attr='token.lefts'))
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property rights:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """The rightward immediate children of the word, in the syntactic
 								        dependency parse.
 								        YIELDS (Token): A right-child of the token.
 								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												* Fix bug in token subtree, introduced by duplication of L/R code in Stateclass. Need to consolidate the two methods.

											
										
										
											2015-09-06 11:48:36 +03:00
+								            cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								            tokens = []
-												* Add loop guard to Token.lefts and Token.rights properties

											
										
										
											2016-01-16 18:18:17 +03:00
+								            cdef int nr_iter = 0
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								            while ptr > self.c:
-												integrated pseudo-projective parsing into parser

- nonproj.pyx holds a class PseudoProjectivity which currently holds
  all functionality to implement Nivre & Nilsson 2005's pseudo-projective
  parsing using the HEAD decoration scheme
- changed lefts/rights in Token to account for possible non-projective
  structures

											
										
										
											2016-03-01 12:09:08 +03:00
+								                if ptr + ptr.head == self.c:
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								                    tokens.append(self.doc[ptr - (self.c - self.i)])
-												integrated pseudo-projective parsing into parser

- nonproj.pyx holds a class PseudoProjectivity which currently holds
  all functionality to implement Nivre & Nilsson 2005's pseudo-projective
  parsing using the HEAD decoration scheme
- changed lefts/rights in Token to account for possible non-projective
  structures

											
										
										
											2016-03-01 12:09:08 +03:00
+								                ptr -= 1
 								                nr_iter += 1
-												* Add loop guard to Token.lefts and Token.rights properties

											
										
										
											2016-01-16 18:18:17 +03:00
+								                if nr_iter >= 10000000:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                    raise RuntimeError(Errors.E045.format(attr='token.rights'))
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								            tokens.reverse()
 								            for t in tokens:
 								                yield t
 								    property children:
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """A sequence of the token's immediate syntactic children.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        YIELDS (Token): A child token such that child.head==self
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            yield from self.lefts
 								            yield from self.rights
 								    property subtree:
-												Correct docs of `Token.subtree` and `Span.subtree` (issue #3122) (#3124)

* solve inconsistency between docs and Span.subtree (issue #3122)

* solve inconsistency between docs and Token.subtree (issue #3122)

											
										
										
											2019-01-09 05:11:15 +03:00
+								        """A sequence containing the token and all the token's syntactic
 								        descendants.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        YIELDS (Token): A descendent token such that
-												Correct docs of `Token.subtree` and `Span.subtree` (issue #3122) (#3124)

* solve inconsistency between docs and Span.subtree (issue #3122)

* solve inconsistency between docs and Token.subtree (issue #3122)

											
										
										
											2019-01-09 05:11:15 +03:00
+								            `self.is_ancestor(descendent) or token == self`.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            for word in self.lefts:
 								                yield from word.subtree
 								            yield self
 								            for word in self.rights:
 								                yield from word.subtree
 								    property left_edge:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """The leftmost token of this token's syntactic descendents.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        RETURNS (Token): The first token such that `self.is_ancestor(token)`.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								            return self.doc[self.c.l_edge]
-												* Whitespace

											
										
										
											2015-08-09 00:37:44 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property right_edge:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """The rightmost token of this token's syntactic descendents.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        RETURNS (Token): The last token such that `self.is_ancestor(token)`.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								            return self.doc[self.c.r_edge]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								    property ancestors:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """A sequence of this token's syntactic ancestors.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        YIELDS (Token): A sequence of ancestor tokens such that
 								            `ancestor.is_ancestor(self)`.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								        def __get__(self):
 								            cdef const TokenC* head_ptr = self.c
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
+								            # guard against infinite loop, no token can have
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								            # more ancestors than tokens in the tree
 								            cdef int i = 0
 								            while head_ptr.head != 0 and i < self.doc.length:
 								                head_ptr += head_ptr.head
 								                yield self.doc[head_ptr - (self.c - self.i)]
 								                i += 1
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
+								    def is_ancestor(self, descendant):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Check whether this token is a parent, grandparent, etc. of another
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
+								        in the dependency tree.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        descendant (Token): Another token.
 								        RETURNS (bool): Whether this token is the ancestor of the descendant.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Fix variable error in token

											
										
										
											2016-11-01 15:28:00 +03:00
+								        if self.doc is not descendant.doc:
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
+								            return False
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        return any(ancestor.i == self.i for ancestor in descendant.ancestors)
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property head:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """The syntactic parent, or "governor", of this token.
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        RETURNS (Token): The token predicted by the parser to be the head of
 								            the current token.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								            return self.doc[self.i + self.c.head]
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								        def __set__(self, Token new_head):
 								            # this function sets the head of self to new_head
 								            # and updates the counters for left/right dependents
 								            # and left/right corner for the new and the old head
 								            # do nothing if old head is new head
 								            if self.i + self.c.head == new_head.i:
 								                return
 								            cdef Token old_head = self.head
 								            cdef int rel_newhead_i = new_head.i - self.i
 								            # is the new head a descendant of the old head
-												Use is_ancestor instead of deprecated is_ancestor_of

											
										
										
											2017-05-19 21:23:40 +03:00
+								            cdef bint is_desc = old_head.is_ancestor(new_head)
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								            cdef int new_edge
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								            cdef Token anc, child
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
 								            # update number of deps of old head
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								            if self.c.head > 0:  # left dependent
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                old_head.c.l_kids -= 1
 								                if self.c.l_edge == old_head.c.l_edge:
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								                    # the token dominates the left edge so the left edge of
 								                    # the  head may change when the token is reattached, it may
 								                    # not change if the new head is a descendant of the current
 								                    # head
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
 								                    new_edge = self.c.l_edge
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								                    # the new l_edge is the left-most l_edge on any of the
 								                    # other dependents where the l_edge is left of the head,
 								                    # otherwise it is the head
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                    if not is_desc:
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                        new_edge = old_head.i
 								                        for child in old_head.children:
 								                            if child == self:
 								                                continue
 								                            if child.c.l_edge < new_edge:
 								                                new_edge = child.c.l_edge
 								                        old_head.c.l_edge = new_edge
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								                    # walk up the tree from old_head and assign new l_edge to
 								                    # ancestors until an ancestor already has an l_edge that's
 								                    # further left
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                    for anc in old_head.ancestors:
 								                        if anc.c.l_edge <= new_edge:
 								                            break
 								                        anc.c.l_edge = new_edge
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								            elif self.c.head < 0:  # right dependent
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                old_head.c.r_kids -= 1
 								                # do the same thing as for l_edge
 								                if self.c.r_edge == old_head.c.r_edge:
 								                    new_edge = self.c.r_edge
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                    if not is_desc:
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                        new_edge = old_head.i
 								                        for child in old_head.children:
 								                            if child == self:
 								                                continue
 								                            if child.c.r_edge > new_edge:
 								                                new_edge = child.c.r_edge
 								                        old_head.c.r_edge = new_edge
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                    for anc in old_head.ancestors:
 								                        if anc.c.r_edge >= new_edge:
 								                            break
 								                        anc.c.r_edge = new_edge
 								            # update number of deps of new head
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								            if rel_newhead_i > 0:  # left dependent
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                new_head.c.l_kids += 1
 								                # walk up the tree from new head and set l_edge to self.l_edge
 								                # until you hit a token with an l_edge further to the left
 								                if self.c.l_edge < new_head.c.l_edge:
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                    new_head.c.l_edge = self.c.l_edge
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                    for anc in new_head.ancestors:
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                        if anc.c.l_edge <= self.c.l_edge:
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                            break
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                        anc.c.l_edge = self.c.l_edge
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								            elif rel_newhead_i < 0:  # right dependent
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                new_head.c.r_kids += 1
 								                # do the same as for l_edge
 								                if self.c.r_edge > new_head.c.r_edge:
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                    new_head.c.r_edge = self.c.r_edge
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                    for anc in new_head.ancestors:
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                        if anc.c.r_edge >= self.c.r_edge:
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                            break
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                        anc.c.r_edge = self.c.r_edge
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
 								            # set new head
 								            self.c.head = rel_newhead_i
-												* Whitespace

											
										
										
											2015-08-09 00:37:44 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property conjuncts:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """A sequence of coordinated tokens, including the token itself.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        YIELDS (Token): A coordinated token.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												* Fix token.conjuncts method

											
										
										
											2015-10-14 19:34:57 +03:00
+								            """Get a list of conjoined words."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								            cdef Token word
-												Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.

											
										
										
											2016-10-19 21:54:03 +03:00
+								            if 'conjuncts' in self.doc.user_token_hooks:
 								                yield from self.doc.user_token_hooks['conjuncts'](self)
-												Defer some attributes to Doc, via getters_for_tokens attribute.

											
										
										
											2016-10-17 03:44:49 +03:00
+								            else:
 								                if self.dep_ != 'conj':
 								                    for word in self.rights:
 								                        if word.dep_ == 'conj':
 								                            yield word
 								                            yield from word.conjuncts
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property ent_type:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): Named entity type."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.ent_type
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        def __set__(self, ent_type):
 								            self.c.ent_type = ent_type
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property ent_iob:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
 								        is assigned.
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        RETURNS (uint64): IOB code of named entity tag.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.ent_iob
 								    property ent_type_:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): Named entity type."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.ent_type]
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        def __set__(self, ent_type):
 								            self.c.ent_type = self.vocab.strings.add(ent_type)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property ent_iob_:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """IOB code of named entity tag. "B" means the token begins an entity,
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        "I" means it is inside an entity, "O" means it is outside an entity,
 								        and "" means no entity tag is set.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
 								        RETURNS (unicode): IOB code of named entity tag.
 								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            iob_strings = ('', 'I', 'O', 'B')
 								            return iob_strings[self.c.ent_iob]
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
+								    property ent_id:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of the entity the token is an instance of,
 								            if any.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
+								        def __get__(self):
-												Fix token.pyx

											
										
										
											2016-09-23 16:07:07 +03:00
+								            return self.c.ent_id
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
 								        def __set__(self, hash_t key):
-												Allow ent_id to be set in Token

											
										
										
											2017-03-31 15:00:14 +03:00
+								            self.c.ent_id = key
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
 								    property ent_id_:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): ID of the entity the token is an instance of,
 								            if any.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.ent_id]
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
-												Allow ent_id to be set in Token

											
										
										
											2017-03-31 15:00:14 +03:00
+								        def __set__(self, name):
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								            self.c.ent_id = self.vocab.strings.add(name)
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property whitespace_:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): The trailing whitespace character, if present.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												* Fix whitespace_ calculation in Token

											
										
										
											2015-10-18 09:21:11 +03:00
+								            return ' ' if self.c.spacy else ''
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property orth_:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): Verbatim text content (identical to
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								            `Token.text`). Exists mostly for consistency with the other
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								            attributes.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.lex.orth]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property lower_:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): The lowercase token text. Equivalent to
 								            `Token.text.lower()`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.lex.lower]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property norm_:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): The token's norm, i.e. a normalised form of the
 								            token text. Usually set in the language's tokenizer exceptions or
 								            norm exceptions.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Make NORM a token attribute (#3029)

See #3028. The solution in this patch is pretty debateable.

What we do is give the TokenC struct a .norm field, by repurposing the previously idle .sense attribute. It's nice to repurpose a previous field because it means the TokenC doesn't change size, so even if someone's using the internals very deeply, nothing will break.

The weird thing here is that the TokenC and the LexemeC both have an attribute named NORM. This arguably assists in backwards compatibility. On the other hand, maybe it's really bad! We're changing the semantics of the attribute subtly, so maybe it's better if someone calling lex.norm gets a breakage, and instead is told to write lex.default_norm?

Overall I believe this patch makes the NORM feature work the way we sort of expected it to work. Certainly it's much more like how the docs describe it, and more in line with how we've been directing people to use the norm attribute. We'll also be able to use token.norm to do stuff like spelling correction, which is pretty cool.

											
										
										
											2018-12-08 12:49:10 +03:00
+								            return self.vocab.strings[self.norm]
 								        def __set__(self, unicode norm_):
 								            self.c.norm = self.vocab.strings.add(norm_)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property shape_:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): Transform of the token's string, to show
 								            orthographic features. For example, "Xxxx" or "dd".
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.lex.shape]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property prefix_:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): A length-N substring from the start of the token.
 								            Defaults to `N=1`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.lex.prefix]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property suffix_:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): A length-N substring from the end of the token.
 								            Defaults to `N=3`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.lex.suffix]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												introduce lang field for LexemeC to hold language id
put noun_chunk logic into iterators.py for each language separately

											
										
										
											2016-03-10 15:01:34 +03:00
+								    property lang_:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): Language of the parent document's vocabulary,
 								            e.g. 'en'.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												introduce lang field for LexemeC to hold language id
put noun_chunk logic into iterators.py for each language separately

											
										
										
											2016-03-10 15:01:34 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.lex.lang]
-												introduce lang field for LexemeC to hold language id
put noun_chunk logic into iterators.py for each language separately

											
										
										
											2016-03-10 15:01:34 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property lemma_:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): The token lemma, i.e. the base form of the word,
 								            with no inflectional suffixes.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Use lookup lemmatizer if lemma unset

											
										
										
											2017-11-18 05:33:31 +03:00
+								            if self.c.lemma == 0:
 								                return self.vocab.morphology.lemmatizer.lookup(self.orth_)
 								            else:
 								                return self.vocab.strings[self.c.lemma]
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												Allow lemma to be set from Python. Re #973

											
										
										
											2017-04-16 19:07:53 +03:00
+								        def __set__(self, unicode lemma_):
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								            self.c.lemma = self.vocab.strings.add(lemma_)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property pos_:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): Coarse-grained part-of-speech tag."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												* Rename ATTR_IDS to attrs.IDS. Rename ATTR_NAMES to attrs.NAMES. Rename UNIV_POS_IDS to parts_of_speech.IDS

											
										
										
											2015-10-10 09:55:55 +03:00
+								            return parts_of_speech.NAMES[self.c.pos]
-												Fix #2014: token.pos_ not writeable

											
										
										
											2018-03-27 22:21:11 +03:00
+								        def __set__(self, pos_name):
 								            self.c.pos = parts_of_speech.IDS[pos_name]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property tag_:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): Fine-grained part-of-speech tag."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.tag]
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												Fix Issue #600: Missing setters for Token attribute.

											
										
										
											2016-11-03 01:28:59 +03:00
+								        def __set__(self, tag):
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								            self.tag = self.vocab.strings.add(tag)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property dep_:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (unicode): The syntactic dependency label."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.dep]
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								        def __set__(self, unicode label):
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								            self.c.dep = self.vocab.strings.add(label)
-												* Break up tokens.pyx into tokens/doc.pyx, tokens/token.pyx, tokens/spans.pyx

											
										
										
											2015-07-13 21:20:58 +03:00
-												* Add is_oov property, and fix up handling of attributes

											
										
										
											2015-07-27 02:50:06 +03:00
+								    property is_oov:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is out-of-vocabulary."""
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, IS_OOV)
-												* Add is_oov property, and fix up handling of attributes

											
										
										
											2015-07-27 02:50:06 +03:00
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								    property is_stop:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is a stop word, i.e. part of a
 								            "stop list" defined by the language data.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, IS_STOP)
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								    property is_alpha:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token consists of alpha characters.
 								            Equivalent to `token.text.isalpha()`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
-												* Whitespace

											
										
										
											2015-08-09 00:37:44 +03:00
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								    property is_ascii:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token consists of ASCII characters.
 								            Equivalent to `all(ord(c) < 128 for c in token.text)`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
 								    property is_digit:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token consists of digits. Equivalent to
 								            `token.text.isdigit()`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
 								    property is_lower:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is in lowercase. Equivalent to
 								            `token.text.islower()`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								    property is_upper:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is in uppercase. Equivalent to
 								            `token.text.isupper()`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								    property is_title:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is in titlecase. Equivalent to
 								            `token.text.istitle()`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
 								    property is_punct:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is punctuation."""
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
+								    property is_space:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token consists of whitespace characters.
 								            Equivalent to `token.text.isspace()`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
 								    property is_bracket:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is a bracket."""
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
-												* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct

											
										
										
											2016-02-04 15:04:16 +03:00
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
+								    property is_quote:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is a quotation mark."""
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
-												* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct

											
										
										
											2016-02-04 15:04:16 +03:00
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
+								    property is_left_punct:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is a left punctuation mark."""
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
-												* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct

											
										
										
											2016-02-04 15:04:16 +03:00
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
+								    property is_right_punct:
-												Fix docstring for is_right_punct(). (#3044)


											
										
										
											2018-12-14 12:11:11 +03:00
+								        """RETURNS (bool): Whether the token is a right punctuation mark."""
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
-												added new lex feat to token

											
										
										
											2018-02-11 20:55:48 +03:00
+								    property is_currency:
 								        """RETURNS (bool): Whether the token is a currency symbol."""
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, IS_CURRENCY)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								    property like_url:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token resembles a URL."""
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
-												* Whitespace

											
										
										
											2015-08-09 00:37:44 +03:00
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								    property like_num:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token resembles a number, e.g. "10.9",
 								            "10", "ten", etc.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
 								    property like_email:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token resembles an email address."""
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)