spaCy/spacy/tokens/token.pyx

# cython: infer_types=True
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
cimport numpy as np
np.import_array()

import numpy
from thinc.api import get_array_module
import warnings

from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, IS_STOP
from ..attrs cimport LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..symbols cimport conj
from .morphanalysis cimport MorphAnalysis
from .doc cimport set_children_from_heads

from .. import parts_of_speech
from ..errors import Errors, Warnings
from .underscore import Underscore, get_ext_args


cdef class Token:
    """An individual token – i.e. a word, punctuation symbol, whitespace,
    etc.

    DOCS: https://spacy.io/api/token
    """
    @classmethod
    def set_extension(cls, name, **kwargs):
        """Register a custom attribute that is exposed through `Token._`.

        name (str): Name of the attribute to set.
        default: Optional default value of the attribute.
        getter (callable): Optional getter function.
        setter (callable): Optional setter function.
        method (callable): Optional method for method extension.
        force (bool): Force overwriting existing attribute.

        RAISES (ValueError): If `name` is already registered and `force` is
            not truthy.

        DOCS: https://spacy.io/api/token#set_extension
        USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes
        """
        already_registered = cls.has_extension(name)
        if already_registered and not kwargs.get("force", False):
            raise ValueError(Errors.E090.format(name=name, obj="Token"))
        extension_args = get_ext_args(**kwargs)
        Underscore.token_extensions[name] = extension_args

    @classmethod
    def get_extension(cls, name):
        """Fetch the definition of a registered extension.

        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple, or
            `None` if no extension is registered under `name`.

        DOCS: https://spacy.io/api/token#get_extension
        """
        registry = Underscore.token_extensions
        return registry.get(name)

    @classmethod
    def has_extension(cls, name):
        """Report whether an extension with the given name exists.

        name (str): Name of the extension.
        RETURNS (bool): Whether the extension has been registered.

        DOCS: https://spacy.io/api/token#has_extension
        """
        registry = Underscore.token_extensions
        return name in registry

    @classmethod
    def remove_extension(cls, name):
        """Unregister an extension and hand back its definition.

        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
            removed extension.
        RAISES (ValueError): If no extension is registered under `name`.

        DOCS: https://spacy.io/api/token#remove_extension
        """
        if cls.has_extension(name):
            return Underscore.token_extensions.pop(name)
        raise ValueError(Errors.E046.format(name=name))

    def __cinit__(self, Vocab vocab, Doc doc, int offset):
        """Create a `Token` view onto one entry of a document.

        vocab (Vocab): A storage container for lexical types.
        doc (Doc): The parent document.
        offset (int): The index of the token within the document.

        DOCS: https://spacy.io/api/token#init
        """
        self.i = offset
        self.vocab = vocab
        self.doc = doc
        # Point at the document's own TokenC entry; no data is copied.
        self.c = &doc.c[offset]

    def __hash__(self):
        """Hash the token by its parent document and its index in it."""
        identity = (self.doc, self.i)
        return hash(identity)

    def __len__(self):
        """Get the length of the token text in unicode characters.

        RETURNS (int): The number of unicode characters in the token.

        DOCS: https://spacy.io/api/token#len
        """
        lexeme = self.c.lex
        return lexeme.length

    def __unicode__(self):
        """RETURNS (str): The token text, i.e. `token.text`."""
        return self.text

    def __bytes__(self):
        """RETURNS (bytes): The token text encoded as UTF-8."""
        text = self.text
        return text.encode('utf8')

    def __str__(self):
        """RETURNS (str): Whatever `__unicode__` returns for this token."""
        return self.__unicode__()

    def __repr__(self):
        """RETURNS (str): The same string as `str(token)`."""
        return self.__str__()

    def __richcmp__(self, Token other, int op):
        """Compare two tokens by their character offset (`idx`).

        Equality and inequality additionally require both tokens to belong
        to the same `Doc` object; the ordering comparisons look at the
        character offsets only.

        other (Token): The token to compare against, or None.
        op (int): Rich comparison code: 0 `<`, 1 `<=`, 2 `==`, 3 `!=`,
            4 `>`, 5 `>=`.
        RETURNS (bool): The result of the comparison.
        RAISES (ValueError): If `op` is not one of the six codes above.
        """
        # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
        cdef Doc my_doc
        cdef Doc other_doc
        if other is None:
            # Against None, `<`, `<=` and `==` are False; the rest are True.
            if op in (0, 1, 2):
                return False
            return True
        my_doc = self.doc
        other_doc = other.doc
        mine = self.idx
        theirs = other.idx
        if op == 2:
            if my_doc is other_doc:
                return mine == theirs
            return False
        if op == 3:
            if my_doc is other_doc:
                return mine != theirs
            return True
        if op == 0:
            return mine < theirs
        if op == 1:
            return mine <= theirs
        if op == 4:
            return mine > theirs
        if op == 5:
            return mine >= theirs
        raise ValueError(Errors.E041.format(op=op))

    def __reduce__(self):
        """Always raise: this method deliberately provides no reduce
        (pickling) implementation for `Token`.

        RAISES (NotImplementedError): On every call.
        """
        raise NotImplementedError(Errors.E111)

    @property
    def _(self):
        """Access point for custom attributes registered via `set_extension`.

        RETURNS (Underscore): A wrapper built from the registered token
            extensions, this token and its character offset.
        """
        extensions = Underscore.token_extensions
        return Underscore(extensions, self, start=self.idx, end=None)

    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
        """Look up a boolean flag on the token's lexeme.

        flag_id (int): The ID of the flag attribute.
        RETURNS (bool): Whether the flag is set.

        DOCS: https://spacy.io/api/token#check_flag
        """
        lexeme = self.c.lex
        return Lexeme.c_check_flag(lexeme, flag_id)

    def nbor(self, int i=1):
        """Fetch a token at a position relative to this one.

        i (int): The relative position of the token to get. Defaults to 1.
        RETURNS (Token): The token at position `self.doc[self.i+i]`.
        RAISES (IndexError): If the resulting position lies outside the
            document.

        DOCS: https://spacy.io/api/token#nbor
        """
        target = self.i + i
        if target < 0 or target >= len(self.doc):
            raise IndexError(Errors.E042.format(i=self.i, j=i, length=len(self.doc)))
        return self.doc[target]

    def similarity(self, other):
        """Estimate semantic similarity with another object. Unless a
        "similarity" hook is installed on the doc, this is the cosine of the
        two objects' vectors.

        other (object): The object to compare with. By default, accepts `Doc`,
            `Span`, `Token` and `Lexeme` objects.
        RETURNS (float): A scalar similarity score. Higher is more similar.

        DOCS: https://spacy.io/api/token#similarity
        """
        hooks = self.doc.user_token_hooks
        if "similarity" in hooks:
            return hooks["similarity"](self, other)
        # Shortcut: identical text means maximal similarity. `other` may be a
        # one-element sequence or an object that has an `orth` itself.
        is_single_item_sequence = (
            hasattr(other, "__len__")
            and len(other) == 1
            and hasattr(other, "__getitem__")
        )
        if is_single_item_sequence:
            if self.c.lex.orth == getattr(other[0], "orth", None):
                return 1.0
        elif hasattr(other, "orth"):
            if self.c.lex.orth == other.orth:
                return 1.0
        if self.vocab.vectors.n_keys == 0:
            warnings.warn(Warnings.W007.format(obj="Token"))
        if self.vector_norm == 0 or other.vector_norm == 0:
            # A zero-length vector on either side: warn and score 0.0.
            warnings.warn(Warnings.W008.format(obj="Token"))
            return 0.0
        mine = self.vector
        xp = get_array_module(mine)
        dot_product = xp.dot(mine, other.vector)
        return (dot_product / (self.vector_norm * other.vector_norm))

    def has_morph(self):
        """Tell whether a morphological annotation is present, i.e. whether
        the stored morph key is non-zero.

        RETURNS (bool): Whether the morph annotation is set.
        """
        return self.c.morph != 0

    property morph:
        """RETURNS (MorphAnalysis): The analysis built from the token's
            stored morph key.
        """
        def __get__(self):
            morph_key = self.c.morph
            return MorphAnalysis.from_id(self.vocab, morph_key)

        def __set__(self, MorphAnalysis morph):
            # An analysis from a different vocab is rejected.
            if self.vocab != morph.vocab:
                raise ValueError(Errors.E1013)
            morph_key = morph.c.key
            self.c.morph = morph_key

    def set_morph(self, features):
        """Set the morphological analysis of the token.

        features: `None` to reset the annotation to unset (key 0), a
            `MorphAnalysis` to assign directly, an int that is first resolved
            through the string store, or any other value accepted by
            `vocab.morphology.add`.
        """
        cdef hash_t key
        if features is None:
            self.c.morph = 0
            return
        if isinstance(features, MorphAnalysis):
            # Goes through the `morph` setter, which validates the vocab.
            self.morph = features
            return
        if isinstance(features, int):
            features = self.vocab.strings[features]
        key = self.vocab.morphology.add(features)
        self.c.morph = key

    @property
    def lex(self):
        """RETURNS (Lexeme): The vocabulary entry looked up by the token's
            orth ID.
        """
        orth_id = self.c.lex.orth
        return self.vocab[orth_id]

    @property
    def lex_id(self):
        """RETURNS (int): Sequential ID of the token's lexical type (the `id`
            field of the underlying lexeme).
        """
        lexeme = self.c.lex
        return lexeme.id

    @property
    def rank(self):
        """RETURNS (int): Sequential ID of the token's lexical type, used to
        index into tables, e.g. for word vectors. Reads the same lexeme
        field as `lex_id`."""
        lexeme = self.c.lex
        return lexeme.id

    @property
    def text(self):
        """RETURNS (str): The original verbatim text of the token (same value
            as `Token.orth_`).
        """
        return self.orth_

    @property
    def text_with_ws(self):
        """RETURNS (str): The text content of the token, followed by a single
            space if the token has trailing whitespace.
        """
        cdef unicode verbatim = self.vocab.strings[self.c.lex.orth]
        if not self.c.spacy:
            return verbatim
        return verbatim + " "

    @property
    def prob(self):
        """RETURNS (float): Smoothed log probability estimate of token type,
            read from the vocabulary entry.
        """
        lexeme = self.vocab[self.c.lex.orth]
        return lexeme.prob

    @property
    def sentiment(self):
        """RETURNS (float): A scalar value indicating the positivity or
            negativity of the token. A "sentiment" hook on the doc takes
            precedence over the vocabulary entry's value."""
        hooks = self.doc.user_token_hooks
        if "sentiment" in hooks:
            return hooks["sentiment"](self)
        lexeme = self.vocab[self.c.lex.orth]
        return lexeme.sentiment

    @property
    def lang(self):
        """RETURNS (uint64): ID of the language of the parent document's
            vocabulary, as stored on the lexeme.
        """
        lexeme = self.c.lex
        return lexeme.lang

    @property
    def idx(self):
        """RETURNS (int): The character offset at which the token starts
            within the parent document.
        """
        char_offset = self.c.idx
        return char_offset

    @property
    def cluster(self):
        """RETURNS (int): Brown cluster ID, read from the vocabulary entry."""
        lexeme = self.vocab[self.c.lex.orth]
        return lexeme.cluster

    @property
    def orth(self):
        """RETURNS (uint64): ID of the token's verbatim text."""
        lexeme = self.c.lex
        return lexeme.orth

    @property
    def lower(self):
        """RETURNS (uint64): ID of the lowercased form of the token text."""
        lexeme = self.c.lex
        return lexeme.lower

    @property
    def norm(self):
        """RETURNS (uint64): ID of the token's norm, i.e. a normalised form of
            the token text. Usually set in the language's tokenizer exceptions
            or norm exceptions. A non-zero token-level norm wins; otherwise
            the lexeme's norm is used.
        """
        token_norm = self.c.norm
        if token_norm != 0:
            return token_norm
        return self.c.lex.norm

    @property
    def shape(self):
        """RETURNS (uint64): ID of the token's shape, a transform of the
            token's string that shows orthographic features (e.g. "Xxxx",
            "dd").
        """
        lexeme = self.c.lex
        return lexeme.shape

    @property
    def prefix(self):
        """RETURNS (uint64): ID of a length-N substring taken from the start
            of the token. Defaults to `N=1`.
        """
        lexeme = self.c.lex
        return lexeme.prefix

    @property
    def suffix(self):
        """RETURNS (uint64): ID of a length-N substring taken from the end of
            the token. Defaults to `N=3`.
        """
        lexeme = self.c.lex
        return lexeme.suffix

    property lemma:
        """RETURNS (uint64): ID of the word's base form, i.e. without
            inflectional suffixes.
        """
        def __get__(self):
            lemma_id = self.c.lemma
            return lemma_id

        def __set__(self, attr_t lemma):
            # Stored directly on the token's C struct.
            self.c.lemma = lemma

    property pos:
        """RETURNS (uint64): ID of the coarse-grained part-of-speech tag."""
        def __get__(self):
            pos_id = self.c.pos
            return pos_id

        def __set__(self, pos):
            # Stored directly on the token's C struct.
            self.c.pos = pos

    property tag:
        """RETURNS (uint64): ID of the fine-grained part-of-speech tag."""
        def __get__(self):
            tag_id = self.c.tag
            return tag_id

        def __set__(self, attr_t tag):
            # Stored directly on the token's C struct.
            self.c.tag = tag

    property dep:
        """RETURNS (uint64): ID of the syntactic dependency label."""
        def __get__(self):
            dep_id = self.c.dep
            return dep_id

        def __set__(self, attr_t label):
            # Stored directly on the token's C struct.
            self.c.dep = label

    @property
    def has_vector(self):
        """Whether a word vector is available for this token.

        RETURNS (bool): Whether a word vector is associated with the object.

        DOCS: https://spacy.io/api/token#has_vector
        """
        hooks = self.doc.user_token_hooks
        if "has_vector" in hooks:
            return hooks["has_vector"](self)
        # With an empty vectors table but a non-empty doc tensor, the tensor
        # row counts as the token's vector.
        vectors_empty = self.vocab.vectors.size == 0
        if vectors_empty and self.doc.tensor.size != 0:
            return True
        return self.vocab.has_vector(self.c.lex.orth)

    @property
    def vector(self):
        """The token's real-valued meaning representation.

        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
            representing the token's semantics.

        DOCS: https://spacy.io/api/token#vector
        """
        hooks = self.doc.user_token_hooks
        if "vector" in hooks:
            return hooks["vector"](self)
        # Fall back to this token's row of the doc tensor when the vectors
        # table is empty and a tensor is available.
        vectors_empty = self.vocab.vectors.size == 0
        if vectors_empty and self.doc.tensor.size != 0:
            return self.doc.tensor[self.i]
        return self.vocab.get_vector(self.c.lex.orth)

    @property
    def vector_norm(self):
        """L2 norm of `Token.vector`, unless a "vector_norm" hook is set.

        RETURNS (float): The L2 norm of the vector representation.

        DOCS: https://spacy.io/api/token#vector_norm
        """
        hooks = self.doc.user_token_hooks
        if "vector_norm" in hooks:
            return hooks["vector_norm"](self)
        vec = self.vector
        xp = get_array_module(vec)
        squared_sum = (vec ** 2).sum()
        if squared_sum != 0.:
            return xp.sqrt(squared_sum)
        return 0.

    @property
    def tensor(self):
        """RETURNS: This token's row of the parent doc's tensor, or `None` if
            the doc has no tensor.
        """
        doc_tensor = self.doc.tensor
        if doc_tensor is not None:
            return doc_tensor[self.i]
        return None

    @property
    def n_lefts(self):
        """Count of the word's immediate children to its left in the
        syntactic dependency parse.

        RETURNS (int): The number of leftward immediate children of the
            word, in the syntactic dependency parse.

        DOCS: https://spacy.io/api/token#n_lefts
        """
        left_count = self.c.l_kids
        return left_count

    @property
    def n_rights(self):
        """Count of the word's immediate children to its right in the
        syntactic dependency parse.

        RETURNS (int): The number of rightward immediate children of the
            word, in the syntactic dependency parse.

        DOCS: https://spacy.io/api/token#n_rights
        """
        right_count = self.c.r_kids
        return right_count

    @property
    def sent(self):
        """RETURNS (Span): The sentence span that the token is a part of. A
            "sent" hook on the doc takes precedence.
        """
        hooks = self.doc.user_token_hooks
        if 'sent' in hooks:
            return hooks["sent"](self)
        # Delegate to the one-token span covering just this token.
        own_span = self.doc[self.i : self.i+1]
        return own_span.sent

    property sent_start:
        def __get__(self):
            """Deprecated: use Token.is_sent_start instead."""
            # Raising a deprecation warning here causes errors for autocomplete
            # Backwards compatibility: doc[0].sent_start has always been
            # False; every other token reports its raw stored value.
            if self.i != 0:
                return self.c.sent_start
            else:
                return False

        def __set__(self, value):
            # Validation and storage are handled by the `is_sent_start` setter.
            self.is_sent_start = value

    property is_sent_start:
        """Whether the token starts a sentence, or `None` if that is unknown.
        Defaults to `True` for the first token in the `Doc`.

        RETURNS (bool / None): Whether the token starts a sentence.
            None if unknown.

        DOCS: https://spacy.io/api/token#is_sent_start
        """
        def __get__(self):
            # Stored encoding: positive = True, negative = False, 0 = unknown.
            flag = self.c.sent_start
            if flag > 0:
                return True
            if flag < 0:
                return False
            return None

        def __set__(self, value):
            # Not settable once the doc carries "DEP" annotation.
            if self.doc.has_annotation("DEP"):
                raise ValueError(Errors.E043)
            # Identity checks on purpose: only the exact objects True, False
            # and None are accepted.
            if value is True:
                self.c.sent_start = 1
            elif value is False:
                self.c.sent_start = -1
            elif value is None:
                self.c.sent_start = 0
            else:
                raise ValueError(Errors.E044.format(value=value))

    property is_sent_end:
        """A boolean value indicating whether the token ends a sentence.
        `None` if unknown. Defaults to `True` for the last token in the `Doc`.

        The value is derived from the following token: this token ends a
        sentence exactly when the next token starts one. The property is
        read-only.

        RETURNS (bool / None): Whether the token ends a sentence.
            None if unknown.
        RAISES (ValueError): On any attempt to assign to the property.

        DOCS: https://spacy.io/api/token#is_sent_end
        """
        def __get__(self):
            if self.i + 1 == len(self.doc):
                return True
            # Fetch the next token's flag once and compare by identity;
            # `is_sent_start` only ever returns True, False or None.
            next_is_start = self.doc[self.i+1].is_sent_start
            if next_is_start is None:
                return None
            elif next_is_start is True:
                return True
            else:
                return False

        def __set__(self, value):
            raise ValueError(Errors.E196)

    @property
    def lefts(self):
        """The leftward immediate children of the word, in the syntactic
        dependency parse.

        YIELDS (Token): A left-child of the token.
        RAISES (RuntimeError): If the scan exceeds 10,000,000 iterations.

        DOCS: https://spacy.io/api/token#lefts
        """
        cdef int nr_iter = 0
        # Start at the TokenC entry whose index is this token's `l_edge`.
        cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
        while ptr < self.c:
            # `head` is a relative offset: this entry is a child of the
            # current token if following it lands on `self.c`.
            if ptr + ptr.head == self.c:
                # `self.c - self.i` is the start of the token array, so the
                # pointer difference is the child's index in the doc.
                yield self.doc[ptr - (self.c - self.i)]
            ptr += 1
            nr_iter += 1
            # This is ugly, but it's a way to guard out infinite loops
            if nr_iter >= 10000000:
                raise RuntimeError(Errors.E045.format(attr="token.lefts"))

    @property
    def rights(self):
        """The rightward immediate children of the word, in the syntactic
        dependency parse.

        YIELDS (Token): A right-child of the token.
        RAISES (RuntimeError): If the scan exceeds 10,000,000 iterations.

        DOCS: https://spacy.io/api/token#rights
        """
        # Start at the TokenC entry whose index is this token's `r_edge` and
        # scan backwards towards the token itself.
        cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
        tokens = []
        cdef int nr_iter = 0
        while ptr > self.c:
            # `head` is a relative offset: this entry is a child of the
            # current token if following it lands on `self.c`.
            if ptr + ptr.head == self.c:
                # `self.c - self.i` is the start of the token array, so the
                # pointer difference is the child's index in the doc.
                tokens.append(self.doc[ptr - (self.c - self.i)])
            ptr -= 1
            nr_iter += 1
            # Guard against infinite loops.
            if nr_iter >= 10000000:
                raise RuntimeError(Errors.E045.format(attr="token.rights"))
        # Children were collected right-to-left; yield them in document order.
        tokens.reverse()
        for t in tokens:
            yield t

    @property
    def children(self):
        """The token's immediate syntactic children: first those to its
        left, then those to its right.

        YIELDS (Token): A child token such that `child.head==self`.

        DOCS: https://spacy.io/api/token#children
        """
        for left_child in self.lefts:
            yield left_child
        for right_child in self.rights:
            yield right_child

    @property
    def subtree(self):
        """The token together with all of its syntactic descendants,
        produced by an in-order walk: left subtrees, the token, right
        subtrees.

        YIELDS (Token): A descendent token such that
            `self.is_ancestor(descendent) or token == self`.

        DOCS: https://spacy.io/api/token#subtree
        """
        for left_child in self.lefts:
            for descendant in left_child.subtree:
                yield descendant
        yield self
        for right_child in self.rights:
            for descendant in right_child.subtree:
                yield descendant

    @property
    def left_edge(self):
        """The leftmost token of this token's syntactic descendents.

        Looks up the doc token at the index stored in the token's `l_edge`
        field. The previous `-> int` return annotation was dropped because
        the property returns a `Token`, not an int.

        RETURNS (Token): The first token such that `self.is_ancestor(token)`.
        """
        return self.doc[self.c.l_edge]

    @property
    def right_edge(self):
        """The rightmost token of this token's syntactic descendents.

        Looks up the doc token at the index stored in the token's `r_edge`
        field. The previous `-> int` return annotation was dropped because
        the property returns a `Token`, not an int.

        RETURNS (Token): The last token such that `self.is_ancestor(token)`.
        """
        return self.doc[self.c.r_edge]

    @property
    def ancestors(self):
        """A sequence of this token's syntactic ancestors, starting with its
        direct head and moving upwards.

        YIELDS (Token): A sequence of ancestor tokens such that
            `ancestor.is_ancestor(self)`.

        DOCS: https://spacy.io/api/token#ancestors
        """
        cdef const TokenC* head_ptr = self.c
        # Guard against infinite loop, no token can have
        # more ancestors than tokens in the tree.
        cdef int i = 0
        # A head offset of 0 ends the walk.
        while head_ptr.head != 0 and i < self.doc.length:
            # `head` is a relative offset, so adding it moves to the parent.
            head_ptr += head_ptr.head
            # `self.c - self.i` is the start of the token array, so the
            # pointer difference is the ancestor's index in the doc.
            yield self.doc[head_ptr - (self.c - self.i)]
            i += 1

    def is_ancestor(self, descendant):
        """Tell whether this token sits above another token (as parent,
        grandparent, etc.) in the dependency tree. Tokens from a different
        `Doc` object are never descendants.

        descendant (Token): Another token.
        RETURNS (bool): Whether this token is the ancestor of the descendant.

        DOCS: https://spacy.io/api/token#is_ancestor
        """
        if self.doc is not descendant.doc:
            return False
        for ancestor in descendant.ancestors:
            if ancestor.i == self.i:
                return True
        return False

    def has_head(self):
        """Tell whether head information has been annotated on the token.
        The result is False when the head annotation is unset/missing.

        RETURNS (bool): Whether the head annotation is valid or not.
        """
        head_is_missing = Token.missing_head(self.c)
        return not head_is_missing

    property head:
        """The syntactic parent, or "governor", of this token.
        If token.has_head() is `False`, this method will return itself.

        Assigning a new head rewrites the stored head offset and recomputes
        the derived parse properties for the affected range of tokens.

        RETURNS (Token): The token predicted by the parser to be the head of
            the current token.
        RAISES (ValueError): On assignment, if the new head belongs to a
            different document.
        """
        def __get__(self):
            if not self.has_head():
                return self
            else:
                # The head is stored as an offset relative to this token.
                return self.doc[self.i + self.c.head]

        def __set__(self, Token new_head):
            # This function sets the head of self to new_head and updates the
            # counters for left/right dependents and left/right corner for the
            # new and the old head
            # Check that token is from the same document
            if self.doc != new_head.doc:
                raise ValueError(Errors.E191)
            # Do nothing if old head is new head
            if self.i + self.c.head == new_head.i:
                return
            # Find the widest l/r_edges of the roots of the two tokens involved
            # to limit the number of tokens for set_children_from_heads
            cdef Token self_root, new_head_root
            # The last ancestor is the root; a token with no ancestors is
            # its own root.
            self_root = ([self] + list(self.ancestors))[-1]
            new_head_ancestors = list(new_head.ancestors)
            new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head
            # Smaller of the two left edges, larger of the two right edges.
            start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge
            end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge
            # Set new head
            self.c.head = new_head.i - self.i
            # Adjust parse properties and sentence starts
            set_children_from_heads(self.doc.c, start, end + 1)

    @property
    def conjuncts(self):
        """A sequence of the tokens coordinated with this one, not including
        the token itself.

        Unless a "conjuncts" hook is set on the doc, the result is found by
        climbing `conj` dependencies up to the first conjunct and then
        collecting every right child attached with a `conj` dependency,
        transitively.

        RETURNS (tuple): The coordinated tokens.

        DOCS: https://spacy.io/api/token#conjuncts
        """
        cdef Token word, child
        if "conjuncts" in self.doc.user_token_hooks:
            return tuple(self.doc.user_token_hooks["conjuncts"](self))
        # Climb to the head of the coordination: follow heads for as long as
        # the current token is attached as `conj` and is not its own head.
        start = self
        while start.i != start.head.i:
            if start.dep == conj:
                start = start.head
            else:
                break
        # Breadth-first walk: `queue` grows while it is being iterated.
        queue = [start]
        output = [start]
        for word in queue:
            for child in word.rights:
                if child.c.dep == conj:
                    output.append(child)
                    queue.append(child)
        # The token itself is excluded from its own conjuncts.
        return tuple([w for w in output if w.i != self.i])

    property ent_type:
        """RETURNS (uint64): ID of the named entity type."""
        def __get__(self):
            type_id = self.c.ent_type
            return type_id

        def __set__(self, ent_type):
            # Stored directly on the token's C struct.
            self.c.ent_type = ent_type

    property ent_type_:
        """RETURNS (str): Named entity type, as a string."""
        def __get__(self):
            strings = self.vocab.strings
            return strings[self.c.ent_type]

        def __set__(self, ent_type):
            # Register the string and store the resulting key.
            type_key = self.vocab.strings.add(ent_type)
            self.c.ent_type = type_key

    @property
    def ent_iob(self):
        """Numeric IOB code of the named entity tag: `1="I", 2="O", 3="B"`,
        and 0 when no tag is assigned.

        RETURNS (uint64): IOB code of named entity tag.
        """
        iob_code = self.c.ent_iob
        return iob_code

    @classmethod
    def iob_strings(cls):
        """RETURNS (tuple): The IOB tag strings, positioned so that a numeric
            IOB code can be used as the index (0 is the empty string).
        """
        return ("", "I", "O", "B")

    @property
    def ent_iob_(self):
        """String form of the named entity IOB tag. "B" means the token
        begins an entity, "I" means it is inside an entity, "O" means it is
        outside an entity, and "" means no entity tag is set. "B" with an
        empty ent_type means that the token is blocked from further
        processing by NER.

        RETURNS (str): IOB code of named entity tag.
        """
        labels = self.iob_strings()
        return labels[self.c.ent_iob]

    property ent_id:
        """RETURNS (uint64): ID of the entity that the token is an instance
            of, if any.
        """
        def __get__(self):
            entity_id = self.c.ent_id
            return entity_id

        def __set__(self, hash_t key):
            # Stored directly on the token's C struct.
            self.c.ent_id = key

    property ent_id_:
        """RETURNS (str): String ID of the entity that the token is an
            instance of, if any.
        """
        def __get__(self):
            strings = self.vocab.strings
            return strings[self.c.ent_id]

        def __set__(self, name):
            # Register the string and store the resulting key.
            id_key = self.vocab.strings.add(name)
            self.c.ent_id = id_key

    property ent_kb_id:
        """RETURNS (uint64): ID of the named entity's KB entry."""
        def __get__(self):
            kb_id = self.c.ent_kb_id
            return kb_id

        def __set__(self, attr_t ent_kb_id):
            # Stored directly on the token's C struct.
            self.c.ent_kb_id = ent_kb_id

    property ent_kb_id_:
        """RETURNS (str): Named entity KB ID, as a string."""
        def __get__(self):
            strings = self.vocab.strings
            return strings[self.c.ent_kb_id]

        def __set__(self, ent_kb_id):
            # Register the string and store the resulting key.
            kb_key = self.vocab.strings.add(ent_kb_id)
            self.c.ent_kb_id = kb_key

    @property
    def whitespace_(self):
        """RETURNS (str): A single space if the token has trailing
            whitespace, otherwise the empty string.
        """
        if self.c.spacy:
            return " "
        return ""

    @property
    def orth_(self):
        """RETURNS (str): Verbatim text content (identical to
            `Token.text`). Kept mostly for consistency with the other
            string attributes.
        """
        strings = self.vocab.strings
        return strings[self.c.lex.orth]

    @property
    def lower_(self):
        """RETURNS (str): The lowercased token text. Equivalent to
            `Token.text.lower()`.
        """
        strings = self.vocab.strings
        return strings[self.c.lex.lower]

    property norm_:
        """RETURNS (str): The token's norm, i.e. a normalised form of the
            token text. Usually set in the language's tokenizer exceptions or
            norm exceptions.
        """
        def __get__(self):
            # `self.norm` already handles the fallback to the lexeme norm.
            strings = self.vocab.strings
            return strings[self.norm]

        def __set__(self, unicode norm_):
            # Register the string and store the key as the token-level norm.
            norm_key = self.vocab.strings.add(norm_)
            self.c.norm = norm_key

    @property
    def shape_(self):
        """RETURNS (str): A transform of the token's string that shows
            orthographic features. For example, "Xxxx" or "dd".
        """
        strings = self.vocab.strings
        return strings[self.c.lex.shape]

    @property
    def prefix_(self):
        """RETURNS (str): A length-N substring taken from the start of the
            token. Defaults to `N=1`.
        """
        strings = self.vocab.strings
        return strings[self.c.lex.prefix]

    @property
    def suffix_(self):
        """RETURNS (str): A length-N substring taken from the end of the
            token. Defaults to `N=3`.
        """
        strings = self.vocab.strings
        return strings[self.c.lex.suffix]

    @property
    def lang_(self):
        """RETURNS (str): Language of the parent document's vocabulary,
            e.g. 'en'.
        """
        strings = self.vocab.strings
        return strings[self.c.lex.lang]

    property lemma_:
        """RETURNS (str): The token lemma, i.e. the word's base form without
            inflectional suffixes.
        """
        def __get__(self):
            strings = self.vocab.strings
            return strings[self.c.lemma]

        def __set__(self, unicode lemma_):
            # Register the string and store the resulting key.
            lemma_key = self.vocab.strings.add(lemma_)
            self.c.lemma = lemma_key

    property pos_:
        """RETURNS (str): Coarse-grained part-of-speech tag.
        RAISES (ValueError): On assignment of a name that is not a key of
            `parts_of_speech.IDS`.
        """
        def __get__(self):
            names = parts_of_speech.NAMES
            return names[self.c.pos]

        def __set__(self, pos_name):
            ids = parts_of_speech.IDS
            if pos_name not in ids:
                raise ValueError(Errors.E1021.format(pp=pos_name))
            self.c.pos = ids[pos_name]

    property tag_:
        """RETURNS (str): Fine-grained part-of-speech tag, as a string."""
        def __get__(self):
            strings = self.vocab.strings
            return strings[self.c.tag]

        def __set__(self, tag):
            # Register the string, then assign through the `tag` property.
            tag_key = self.vocab.strings.add(tag)
            self.tag = tag_key

    def has_dep(self):
        """Tell whether a dependency label has been annotated on the token.
        The result is False when the dep label is unset/missing.

        RETURNS (bool): Whether the dep label is valid or not.
        """
        dep_is_missing = Token.missing_dep(self.c)
        return not dep_is_missing

    property dep_:
        """RETURNS (str): The syntactic dependency label, as a string."""
        def __get__(self):
            strings = self.vocab.strings
            return strings[self.c.dep]

        def __set__(self, unicode label):
            # Register the string and store the resulting key.
            label_key = self.vocab.strings.add(label)
            self.c.dep = label_key

    @property
    def is_oov(self):
        """RETURNS (bool): Whether the token is out-of-vocabulary, i.e. its
            orth ID is not contained in the vocab's vectors.
        """
        vectors = self.vocab.vectors
        return self.c.lex.orth not in vectors

    @property
    def is_stop(self):
        """RETURNS (bool): Whether the token is a stop word, i.e. belongs to
            a "stop list" defined by the language data.
        """
        lexeme = self.c.lex
        return Lexeme.c_check_flag(lexeme, IS_STOP)

    @property
    def is_alpha(self):
        """RETURNS (bool): Whether the token is made up of alpha characters.
            Equivalent to `token.text.isalpha()`.
        """
        lexeme = self.c.lex
        return Lexeme.c_check_flag(lexeme, IS_ALPHA)

    @property
    def is_ascii(self):
        """RETURNS (bool): Whether the token consists of ASCII characters.
            Equivalent to `all(ord(c) < 128 for c in token.text)`.
        """
        # Reads the IS_ASCII flag stored on the lexeme.
        return Lexeme.c_check_flag(self.c.lex, IS_ASCII)

    @property
    def is_digit(self):
        """RETURNS (bool): Whether the token is made up of digits. Equivalent
            to `token.text.isdigit()`.
        """
        lexeme = self.c.lex
        return Lexeme.c_check_flag(lexeme, IS_DIGIT)

    @property
    def is_lower(self):
        """RETURNS (bool): Whether the token is lowercase. Equivalent to
            `token.text.islower()`.
        """
        lexeme = self.c.lex
        return Lexeme.c_check_flag(lexeme, IS_LOWER)

    @property
    def is_upper(self):
        """RETURNS (bool): Whether the token is uppercase. Equivalent to
            `token.text.isupper()`
        """
        lexeme = self.c.lex
        return Lexeme.c_check_flag(lexeme, IS_UPPER)

    @property
    def is_title(self):
        """RETURNS (bool): Whether the token is titlecased. Equivalent to
            `token.text.istitle()`.
        """
        lexeme = self.c.lex
        return Lexeme.c_check_flag(lexeme, IS_TITLE)

    @property
    def is_punct(self):
        """RETURNS (bool): Whether the lexeme's IS_PUNCT flag is set, i.e.
            the token is punctuation.
        """
        lexeme = self.c.lex
        return Lexeme.c_check_flag(lexeme, IS_PUNCT)

    @property
    def is_space(self):
        """RETURNS (bool): Whether the token is made up of whitespace
            characters. Equivalent to `token.text.isspace()`.
        """
        lexeme = self.c.lex
        return Lexeme.c_check_flag(lexeme, IS_SPACE)

    @property
    def is_bracket(self):
        """RETURNS (bool): Whether the lexeme's IS_BRACKET flag is set, i.e.
            the token is a bracket.
        """
        lexeme = self.c.lex
        return Lexeme.c_check_flag(lexeme, IS_BRACKET)

    @property
    def is_quote(self):
        """RETURNS (bool): Whether the lexeme's IS_QUOTE flag is set, i.e.
            the token is a quotation mark.
        """
        lexeme = self.c.lex
        return Lexeme.c_check_flag(lexeme, IS_QUOTE)

    @property
    def is_left_punct(self):
        """RETURNS (bool): Whether the token is a left punctuation mark."""
        # Checks the IS_LEFT_PUNCT flag on the lexeme this token points to
        # (`self.c.lex`).
        return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)

    @property
    def is_right_punct(self):
        """RETURNS (bool): Whether the token is a right punctuation mark."""
        # Checks the IS_RIGHT_PUNCT flag on the lexeme this token points to
        # (`self.c.lex`).
        return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)

    @property
    def is_currency(self):
        """RETURNS (bool): Whether the token is a currency symbol."""
        # Checks the IS_CURRENCY flag on the lexeme this token points to
        # (`self.c.lex`).
        return Lexeme.c_check_flag(self.c.lex, IS_CURRENCY)

    @property
    def like_url(self):
        """RETURNS (bool): Whether the token resembles a URL."""
        # Checks the LIKE_URL flag on the lexeme this token points to
        # (`self.c.lex`).
        return Lexeme.c_check_flag(self.c.lex, LIKE_URL)

    @property
    def like_num(self):
        """RETURNS (bool): Whether the token resembles a number, e.g. "10.9",
            "10", "ten", etc.
        """
        # Checks the LIKE_NUM flag on the lexeme this token points to
        # (`self.c.lex`).
        return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)

    @property
    def like_email(self):
        """RETURNS (bool): Whether the token resembles an email address."""
        # Checks the LIKE_EMAIL flag on the lexeme this token points to
        # (`self.c.lex`).
        return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
-												Fix issue #672: ent_iob_ was a string, not unicode, due to missing unicode_literals statement.

											
										
										
											2016-12-19 00:33:53 +03:00
+								# cython: infer_types=True
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								# Compiler crashes on memory view coercion without this. Should report bug.
 								from cython.view cimport array as cvarray
-												* Break up tokens.pyx into tokens/doc.pyx, tokens/token.pyx, tokens/spans.pyx

											
										
										
											2015-07-13 21:20:58 +03:00
+								cimport numpy as np
 								np.import_array()
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												* Break up tokens.pyx into tokens/doc.pyx, tokens/token.pyx, tokens/spans.pyx

											
										
										
											2015-07-13 21:20:58 +03:00
+								import numpy
-												Tidy up and auto-format

											
										
										
											2020-02-18 17:38:18 +03:00
+								from thinc.api import get_array_module
-												Simplify warnings

											
										
										
											2020-04-28 14:37:37 +03:00
+								import warnings
-												* Break up tokens.pyx into tokens/doc.pyx, tokens/token.pyx, tokens/spans.pyx

											
										
										
											2015-07-13 21:20:58 +03:00
-												Import hash_t typedef in token.pyx

											
										
										
											2016-09-23 15:22:06 +03:00
+								from ..typedefs cimport hash_t
-												* Begin merge of Gazetteer and DE branches

											
										
										
											2015-09-06 20:45:15 +03:00
+								from ..lexeme cimport Lexeme
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
-												Clean up spacy.tokens (#6046)

* Clean up spacy.tokens

* Update `set_children_from_heads`:
  * Don't check `dep` when setting lr_* or sentence starts
  * Set all non-sentence starts to `False`

* Use `set_children_from_heads` in `Token.head` setter
  * Reduce similar/duplicate code (admittedly adds a bit of overhead)
  * Update sentence starts consistently

* Remove unused `Doc.set_parse`

* Minor changes:
  * Declare cython variables (to avoid cython warnings)
  * Clean up imports

* Modify set_children_from_heads to set token range

Modify `set_children_from_heads` so that it adjust tokens within a
specified range rather then the whole document.

Modify the `Token.head` setter to adjust only the tokens affected by the
new head assignment.
											
										
										
											2020-09-16 21:32:38 +03:00
+								from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, IS_STOP
 								from ..attrs cimport LIKE_URL, LIKE_NUM, LIKE_EMAIL
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								from ..symbols cimport conj
-												Tidy up compiler flags and imports (#5071)


											
										
										
											2020-03-02 13:48:10 +03:00
+								from .morphanalysis cimport MorphAnalysis
-												Clean up spacy.tokens (#6046)

* Clean up spacy.tokens

* Update `set_children_from_heads`:
  * Don't check `dep` when setting lr_* or sentence starts
  * Set all non-sentence starts to `False`

* Use `set_children_from_heads` in `Token.head` setter
  * Reduce similar/duplicate code (admittedly adds a bit of overhead)
  * Update sentence starts consistently

* Remove unused `Doc.set_parse`

* Minor changes:
  * Declare cython variables (to avoid cython warnings)
  * Clean up imports

* Modify set_children_from_heads to set token range

Modify `set_children_from_heads` so that it adjust tokens within a
specified range rather then the whole document.

Modify the `Token.head` setter to adjust only the tokens affected by the
new head assignment.
											
										
										
											2020-09-16 21:32:38 +03:00
+								from .doc cimport set_children_from_heads
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								from .. import parts_of_speech
-												Simplify warnings

											
										
										
											2020-04-28 14:37:37 +03:00
+								from ..errors import Errors, Warnings
-												Don't raise error if set_extension has getter and setter (closes #2177)

Improve error messages, raise error if setter is specified without a getter and compare against _unset to allow default=None. Also add more tests.

											
										
										
											2018-04-03 19:30:17 +03:00
+								from .underscore import Underscore, get_ext_args
-												* Work on language-independent refactoring

											
										
										
											2015-08-23 21:49:18 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								cdef class Token:
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								    """An individual token – i.e. a word, punctuation symbol, whitespace,
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								    etc.
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								    DOCS: https://spacy.io/api/token
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								    """
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
+								    @classmethod
-												Don't raise error if set_extension has getter and setter (closes #2177)

Improve error messages, raise error if setter is specified without a getter and compare against _unset to allow default=None. Also add more tests.

											
										
										
											2018-04-03 19:30:17 +03:00
+								    def set_extension(cls, name, **kwargs):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """Define a custom attribute which becomes available as `Token._`.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        name (str): Name of the attribute to set.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        default: Optional default value of the attribute.
 								        getter (callable): Optional getter function.
 								        setter (callable): Optional setter function.
 								        method (callable): Optional method for method extension.
 								        force (bool): Force overwriting existing attribute.
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#set_extension
 								        USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """
 								        if cls.has_extension(name) and not kwargs.get("force", False):
 								            raise ValueError(Errors.E090.format(name=name, obj="Token"))
-												Don't raise error if set_extension has getter and setter (closes #2177)

Improve error messages, raise error if setter is specified without a getter and compare against _unset to allow default=None. Also add more tests.

											
										
										
											2018-04-03 19:30:17 +03:00
+								        Underscore.token_extensions[name] = get_ext_args(**kwargs)
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
 								    @classmethod
 								    def get_extension(cls, name):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """Look up a previously registered extension by name.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        name (str): Name of the extension.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        RETURNS (tuple): A `(default, method, getter, setter)` tuple.
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#get_extension
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """
-												Fix Token.set_extension

											
										
										
											2018-04-29 16:48:19 +03:00
+								        return Underscore.token_extensions.get(name)
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
 								    @classmethod
 								    def has_extension(cls, name):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """Check whether an extension has been registered.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        name (str): Name of the extension.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        RETURNS (bool): Whether the extension has been registered.
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#has_extension
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """
-												Fix Token.set_extension

											
										
										
											2018-04-29 16:48:19 +03:00
+								        return name in Underscore.token_extensions
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
-												Add remove_extension method on Doc, Token and Span (closes #2242)

											
										
										
											2018-04-29 00:33:09 +03:00
+								    @classmethod
 								    def remove_extension(cls, name):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """Remove a previously registered extension.
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        name (str): Name of the extension.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
 								            removed extension.
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#remove_extension
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """
-												Add remove_extension method on Doc, Token and Span (closes #2242)

											
										
										
											2018-04-29 00:33:09 +03:00
+								        if not cls.has_extension(name):
 								            raise ValueError(Errors.E046.format(name=name))
 								        return Underscore.token_extensions.pop(name)
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								    def __cinit__(self, Vocab vocab, Doc doc, int offset):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Construct a `Token` object.
 								        vocab (Vocab): A storage container for lexical types.
 								        doc (Doc): The parent document.
 								        offset (int): The index of the token within the document.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#init
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        self.vocab = vocab
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								        self.doc = doc
-												* Rename Doc.data to Doc.c

											
										
										
											2015-11-03 16:15:14 +03:00
+								        self.c = &self.doc.c[offset]
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								        self.i = offset
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Make Token hashable. Fixes #743

											
										
										
											2017-01-16 15:27:57 +03:00
+								    def __hash__(self):
 								        return hash((self.doc, self.i))
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    def __len__(self):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """The number of unicode characters in the token, i.e. `token.text`.
 								        RETURNS (int): The number of unicode characters in the token.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#len
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        return self.c.lex.length
 								    def __unicode__(self):
-												Remove deprecation shim around str/bytes in Token.

											
										
										
											2016-10-17 15:02:47 +03:00
+								        return self.text
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												fixed error when printing unicode

											
										
										
											2015-11-02 21:22:18 +03:00
+								    def __bytes__(self):
-												Remove deprecation shim around str/bytes in Token.

											
										
										
											2016-10-17 15:02:47 +03:00
+								        return self.text.encode('utf8')
-												fixed error when printing unicode

											
										
										
											2015-11-02 21:22:18 +03:00
-												* Fix string coercion for Python 3

											
										
										
											2015-07-24 04:49:30 +03:00
+								    def __str__(self):
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								        return self.__unicode__()
-												* Fix string coercion for Python 3

											
										
										
											2015-07-24 04:49:30 +03:00
-												added __repr__ that prints text in ipython for doc, token, and span objects

											
										
										
											2015-10-21 14:11:46 +03:00
+								    def __repr__(self):
-												fixed error when printing unicode

											
										
										
											2015-11-02 21:22:18 +03:00
+								        return self.__str__()
-												added __repr__ that prints text in ipython for doc, token, and span objects

											
										
										
											2015-10-21 14:11:46 +03:00
-												Amend 8ae8b443f: Handle comparison with None tokens.

											
										
										
											2017-01-11 15:03:32 +03:00
+								    def __richcmp__(self, Token other, int op):
-												Add richcmp method to Token. Closes #631

											
										
										
											2017-01-09 21:30:31 +03:00
+								        # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
-												Fix rich comparison against None objects. Closes #1757

											
										
										
											2018-01-15 17:51:25 +03:00
+								        if other is None:
 								            if op in (0, 1, 2):
 								                return False
 								            else:
 								                return True
-												Fix comparison of Token from different docs. Closes #1257

											
										
										
											2017-08-19 17:39:32 +03:00
+								        cdef Doc my_doc = self.doc
 								        cdef Doc other_doc = other.doc
-												Add richcmp method to Token. Closes #631

											
										
										
											2017-01-09 21:30:31 +03:00
+								        my = self.idx
-												Fix rich comparison against None objects. Closes #1757

											
										
										
											2018-01-15 17:51:25 +03:00
+								        their = other.idx
-												Add richcmp method to Token. Closes #631

											
										
										
											2017-01-09 21:30:31 +03:00
+								        if op == 0:
 								            return my < their
 								        elif op == 2:
-												Fix comparison of Token from different docs. Closes #1257

											
										
										
											2017-08-19 17:39:32 +03:00
+								            if my_doc is other_doc:
 								                return my == their
 								            else:
 								                return False
-												Add richcmp method to Token. Closes #631

											
										
										
											2017-01-09 21:30:31 +03:00
+								        elif op == 4:
 								            return my > their
 								        elif op == 1:
 								            return my <= their
 								        elif op == 3:
-												Fix comparison of Token from different docs. Closes #1257

											
										
										
											2017-08-19 17:39:32 +03:00
+								            if my_doc is other_doc:
 								                return my != their
 								            else:
 								                return True
-												Add richcmp method to Token. Closes #631

											
										
										
											2017-01-09 21:30:31 +03:00
+								        elif op == 5:
 								            return my >= their
 								        else:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								            raise ValueError(Errors.E041.format(op=op))
-												Add richcmp method to Token. Closes #631

											
										
										
											2017-01-09 21:30:31 +03:00
-												Raise better error if token is pickled (resolves #2833) (#3267)


											
										
										
											2019-02-13 13:27:04 +03:00
+								    def __reduce__(self):
 								        raise NotImplementedError(Errors.E111)
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
+								    @property
 								    def _(self):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """Custom extension attributes registered via `set_extension`."""
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
+								        return Underscore(Underscore.token_extensions, self,
 								                          start=self.idx, end=None)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Check the value of a boolean flag.
 								        flag_id (int): The ID of the flag attribute.
 								        RETURNS (bool): Whether the flag is set.
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#check_flag
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, flag_id)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    def nbor(self, int i=1):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Get a neighboring token.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        i (int): The relative position of the token to get. Defaults to 1.
 								        RETURNS (Token): The token at position `self.doc[self.i+i]`.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#nbor
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Fix #1375 -- out-of-bounds on token.nbor()

											
										
										
											2017-10-24 13:10:39 +03:00
+								        if self.i+i < 0 or (self.i+i >= len(self.doc)):
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								            raise IndexError(Errors.E042.format(i=self.i, j=i, length=len(self.doc)))
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								        return self.doc[self.i+i]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								    def similarity(self, other):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Make a semantic similarity estimate. The default estimate is cosine
 								        similarity using an average of word vectors.
 								        other (object): The object to compare with. By default, accepts `Doc`,
 								            `Span`, `Token` and `Lexeme` objects.
 								        RETURNS (float): A scalar similarity score. Higher is more similar.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#similarity
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        if "similarity" in self.doc.user_token_hooks:
-												Fix bug in Token.similarity when called via hook

											
										
										
											2019-07-27 16:26:01 +03:00
+								            return self.doc.user_token_hooks["similarity"](self, other)
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        if hasattr(other, "__len__") and len(other) == 1 and hasattr(other, "__getitem__"):
 								            if self.c.lex.orth == getattr(other[0], "orth", None):
-												Make .similarity() return 1.0 if all orth attrs match

											
										
										
											2018-01-15 18:29:48 +03:00
+								                return 1.0
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        elif hasattr(other, "orth"):
-												Make .similarity() return 1.0 if all orth attrs match

											
										
										
											2018-01-15 18:29:48 +03:00
+								            if self.c.lex.orth == other.orth:
 								                return 1.0
-												💫 Add .similarity warnings for no vectors and option to exclude warnings (#2197)

* Add logic to filter out warning IDs via environment variable

Usage: SPACY_WARNING_EXCLUDE=W001,W007

* Add warnings for empty vectors

* Add warning if no word vectors are used in .similarity methods

For example, if only tensors are available in small models – should hopefully clear up some confusion around this

* Capture warnings in tests

* Rename SPACY_WARNING_EXCLUDE to SPACY_WARNING_IGNORE

											
										
										
											2018-05-21 02:22:38 +03:00
+								        if self.vocab.vectors.n_keys == 0:
-												Simplify warnings

											
										
										
											2020-04-28 14:37:37 +03:00
+								            warnings.warn(Warnings.W007.format(obj="Token"))
-												* Fix vectors bugs for OOV words

											
										
										
											2015-09-22 03:10:01 +03:00
+								        if self.vector_norm == 0 or other.vector_norm == 0:
-												Simplify warnings

											
										
										
											2020-04-28 14:37:37 +03:00
+								            warnings.warn(Warnings.W008.format(obj="Token"))
-												* Fix vectors bugs for OOV words

											
										
										
											2015-09-22 03:10:01 +03:00
+								            return 0.0
-												Don't use numpy directly for similarity (#3362)

* Don't use numpy directly for similarity

* Contributor agreement

											
										
										
											2019-03-07 01:58:38 +03:00
+								        vector = self.vector
-												Merge branch 'master' into develop

											
										
										
											2019-03-07 02:56:31 +03:00
+								        xp = get_array_module(vector)
-												Tidy up and only use self.vector once

											
										
										
											2019-03-07 03:06:12 +03:00
+								        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
-												Handle unset token.morph in Morphologizer (#6704)

* Handle unset token.morph in Morphologizer

Handle unset `token.morph` in `Morphologizer.initialize` and
`Morphologizer.get_loss`. If both `token.morph` and `token.pos` are
unset, treat the annotation as missing rather than empty.

* Add token.has_morph()
											
										
										
											2021-01-15 19:20:10 +03:00
+								    def has_morph(self):
 								        """Check whether the token has annotated morph information.
 								        Return False when the morph annotation is unset/missing.
 								        RETURNS (bool): Whether the morph annotation is set.
 								        """
 								        return not self.c.morph == 0
-												Modify Token.morph to enable unsetting (#6043)

Modify `Token.morph` property so that `Token.c.morph` can be reset back
to an internal value of `0`. Allow setting `Token.morph` from a hash as
long as the morph string is already in the `StringStore`, setting it
indirectly through `Token.morph_` so that the value is added to the
morphology. If the hash is not in the `StringStore`, raise an error.
											
										
										
											2020-09-13 15:06:07 +03:00
+								    property morph:
 								        def __get__(self):
 								            return MorphAnalysis.from_id(self.vocab, self.c.morph)
-												Refactor Token morph setting (#6175)

* Refactor Token morph setting

* Remove `Token.morph_`
* Add `Token.set_morph()`
  * `0` resets `token.c.morph` to unset
  * Any other values are passed to `Morphology.add`

* Add token.morph setter to set from MorphAnalysis
											
										
										
											2020-10-01 23:21:46 +03:00
+								        def __set__(self, MorphAnalysis morph):
 								            # Check that the morph has the same vocab
 								            if self.vocab != morph.vocab:
 								                raise ValueError(Errors.E1013)
 								            self.c.morph = morph.c.key
 								    def set_morph(self, features):
 								        cdef hash_t key
-												Switch reset value for set_morph to None

											
										
										
											2020-10-02 09:25:15 +03:00
+								        if features is None:
-												Refactor Token morph setting (#6175)

* Refactor Token morph setting

* Remove `Token.morph_`
* Add `Token.set_morph()`
  * `0` resets `token.c.morph` to unset
  * Any other values are passed to `Morphology.add`

* Add token.morph setter to set from MorphAnalysis
											
										
										
											2020-10-01 23:21:46 +03:00
+								            self.c.morph = 0
-												Also accept MorphAnalysis in set_morph

											
										
										
											2020-10-02 09:33:43 +03:00
+								        elif isinstance(features, MorphAnalysis):
 								            self.morph = features
-												Refactor Token morph setting (#6175)

* Refactor Token morph setting

* Remove `Token.morph_`
* Add `Token.set_morph()`
  * `0` resets `token.c.morph` to unset
  * Any other values are passed to `Morphology.add`

* Add token.morph setter to set from MorphAnalysis
											
										
										
											2020-10-01 23:21:46 +03:00
+								        else:
 								            if isinstance(features, int):
 								                features = self.vocab.strings[features]
 								            key = self.vocab.morphology.add(features)
-												Modify morphology to support arbitrary features (#4932)

* Restructure tag maps for MorphAnalysis changes

Prepare tag maps for upcoming MorphAnalysis changes that allow
arbritrary features.

* Use default tag map rather than duplicating for ca / uk / vi

* Import tag map into defaults for ga

* Modify tag maps so all morphological fields and features are strings
  * Move features from `"Other"` to the top level
  * Rewrite tuples as strings separated by `","`

* Rewrite morph symbols for fr lemmatizer as strings

* Export MorphAnalysis under spacy.tokens

* Modify morphology to support arbitrary features

Modify `Morphology` and `MorphAnalysis` so that arbitrary features are
supported.

* Modify `MorphAnalysisC` so that it can support arbitrary features and
multiple values per field. `MorphAnalysisC` is redesigned to contain:
  * key: hash of UD FEATS string of morphological features
  * array of `MorphFeatureC` structs that each contain a hash of `Field`
and `Field=Value` for a given morphological feature, which makes it
possible to:
    * find features by field
    * represent multiple values for a given field

* `get_field()` is renamed to `get_by_field()` and is no longer `nogil`.
Instead a new helper function `get_n_by_field()` is `nogil` and returns
`n` features by field.

* `MorphAnalysis.get()` returns all possible values for a field as a
list of individual features such as `["Tense=Pres", "Tense=Past"]`.

* `MorphAnalysis`'s `str()` and `repr()` are the UD FEATS string.

* `Morphology.feats_to_dict()` converts a UD FEATS string to a dict
where:
  * Each field has one entry in the dict
  * Multiple values remain separated by a separator in the value string

* `Token.morph_` returns the UD FEATS string and you can set
`Token.morph_` with a UD FEATS string or with a tag map dict.

* Modify get_by_field to use np.ndarray

Modify `get_by_field()` to use np.ndarray. Remove `max_results` from
`get_n_by_field()` and always iterate over all the fields.

* Rewrite without MorphFeatureC

* Add shortcut for existing feats strings as keys

Add shortcut for existing feats strings as keys in `Morphology.add()`.

* Check for '_' as empty analysis when adding morphs

* Extend helper converters in Morphology

Add and extend helper converters that convert and normalize between:

* UD FEATS strings (`"Case=dat,gen|Number=sing"`)
* per-field dict of feats (`{"Case": "dat,gen", "Number": "sing"}`)
* list of individual features (`["Case=dat", "Case=gen",
"Number=sing"]`)

All converters sort fields and values where applicable.

											
										
										
											2020-01-24 00:01:54 +03:00
+								            self.c.morph = key
-												Add Token.lex

											
										
										
											2020-08-10 17:43:52 +03:00
+								    @property
 								    def lex(self):
 								        """RETURNS (Lexeme): The underlying lexeme."""
 								        return self.vocab[self.c.lex.orth]
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def lex_id(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (int): Sequential ID of the token's lexical type."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.c.lex.id
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def rank(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (int): Sequential ID of the token's lexical type, used to
 								        index into tables, e.g. for word vectors."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.c.lex.id
-												* Add .rank property to Token and Lexeme, for frequency rank

											
										
										
											2015-11-08 18:18:25 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def text(self):
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): The original verbatim text of the token."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.orth_
-												* Add test and test_with_ws attributes.

											
										
										
											2015-09-13 03:27:42 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def text_with_ws(self):
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): The text content of the span (with trailing
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								            whitespace).
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        cdef unicode orth = self.vocab.strings[self.c.lex.orth]
 								        if self.c.spacy:
 								            return orth + " "
 								        else:
 								            return orth
-												* Add test and test_with_ws attributes.

											
										
										
											2015-09-13 03:27:42 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def prob(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (float): Smoothed log probability estimate of token type."""
-												Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemesC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 16:59:14 +03:00
+								        return self.vocab[self.c.lex.orth].prob
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def sentiment(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (float): A scalar value indicating the positivity or
 								            negativity of the token."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        if "sentiment" in self.doc.user_token_hooks:
 								            return self.doc.user_token_hooks["sentiment"](self)
-												Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemeC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 16:59:14 +03:00
+								        return self.vocab[self.c.lex.orth].sentiment
-												Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.

											
										
										
											2016-10-19 21:54:03 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def lang(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of the language of the parent document's
 								            vocabulary.
 								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.c.lex.lang
-												introduce lang field for LexemeC to hold language id
put noun_chunk logic into iterators.py for each language separately

											
										
										
											2016-03-10 15:01:34 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def idx(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (int): The character offset of the token within the parent
 								            document.
 								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.c.idx
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def cluster(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (int): Brown cluster ID."""
-												Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemeC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 16:59:14 +03:00
+								        return self.vocab[self.c.lex.orth].cluster
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def orth(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of the verbatim text content."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.c.lex.orth
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def lower(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of the lowercase token text."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.c.lex.lower
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def norm(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of the token's norm, i.e. a normalised form of
 								            the token text. Usually set in the language's tokenizer exceptions
 								            or norm exceptions.
 								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        if self.c.norm == 0:
 								            return self.c.lex.norm
 								        else:
 								            return self.c.norm
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def shape(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of the token's shape, a transform of the
-												fix 's typo's across code base (#8384)


											
										
										
											2021-06-15 11:57:08 +03:00
+								            token's string, to show orthographic features (e.g. "Xxxx", "dd").
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.c.lex.shape
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def prefix(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of a length-N substring from the start of the
 								            token. Defaults to `N=1`.
 								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.c.lex.prefix
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def suffix(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of a length-N substring from the end of the
 								            token. Defaults to `N=3`.
 								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.c.lex.suffix
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property lemma:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of the base form of the word, with no
 								            inflectional suffixes.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Add Lemmatizer and simplify related components (#5848)

* Add Lemmatizer and simplify related components

* Add `Lemmatizer` pipe with `lookup` and `rule` modes using the
`Lookups` tables.
* Reduce `Tagger` to a simple tagger that sets `Token.tag` (no pos or lemma)
* Reduce `Morphology` to only keep track of morph tags (no tag map, lemmatizer,
or morph rules)
* Remove lemmatizer from `Vocab`
* Adjust many many tests

Differences:

* No default lookup lemmas
* No special treatment of TAG in `from_array` and similar required
* Easier to modify labels in a `Tagger`
* No extra strings added from morphology / tag map

* Fix test

* Initial fix for Lemmatizer config/serialization

* Adjust init test to be more generic

* Adjust init test to force empty Lookups

* Add simple cache to rule-based lemmatizer

* Convert language-specific lemmatizers

Convert language-specific lemmatizers to component lemmatizers. Remove
previous lemmatizer class.

* Fix French and Polish lemmatizers

* Remove outdated UPOS conversions

* Update Russian lemmatizer init in tests

* Add minimal init/run tests for custom lemmatizers

* Add option to overwrite existing lemmas

* Update mode setting, lookup loading, and caching

* Make `mode` an immutable property
* Only enforce strict `load_lookups` for known supported modes
* Move caching into individual `_lemmatize` methods

* Implement strict when lang is not found in lookups

* Fix tables/lookups in make_lemmatizer

* Reallow provided lookups and allow for stricter checks

* Add lookups asset to all Lemmatizer pipe tests

* Rename lookups in lemmatizer init test

* Clean up merge

* Refactor lookup table loading

* Add helper from `load_lemmatizer_lookups` that loads required and
optional lookups tables based on settings provided by a config.

Additional slight refactor of lookups:

* Add `Lookups.set_table` to set a table from a provided `Table`
* Reorder class definitions to be able to specify type as `Table`

* Move registry assets into test methods

* Refactor lookups tables config

Use class methods within `Lemmatizer` to provide the config for
particular modes and to load the lookups from a config.

* Add pipe and score to lemmatizer

* Simplify Tagger.score

* Add missing import

* Clean up imports and auto-format

* Remove unused kwarg

* Tidy up and auto-format

* Update docstrings for Lemmatizer

Update docstrings for Lemmatizer.

Additionally modify `is_base_form` API to take `Token` instead of
individual features.

* Update docstrings

* Remove tag map values from Tagger.add_label

* Update API docs

* Fix relative link in Lemmatizer API docs
											
										
										
											2020-08-07 16:27:13 +03:00
+								            return self.c.lemma
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        def __set__(self, attr_t lemma):
-												Allow lemma to be set from Python. Re #973

											
										
										
											2017-04-16 19:07:53 +03:00
+								            self.c.lemma = lemma
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property pos:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.pos
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Fix #2014: token.pos_ not writeable

											
										
										
											2018-03-27 22:21:11 +03:00
+								        def __set__(self, pos):
 								            self.c.pos = pos
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property tag:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of fine-grained part-of-speech tag."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.tag
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        def __set__(self, attr_t tag):
-												Add Lemmatizer and simplify related components (#5848)

* Add Lemmatizer and simplify related components

* Add `Lemmatizer` pipe with `lookup` and `rule` modes using the
`Lookups` tables.
* Reduce `Tagger` to a simple tagger that sets `Token.tag` (no pos or lemma)
* Reduce `Morphology` to only keep track of morph tags (no tag map, lemmatizer,
or morph rules)
* Remove lemmatizer from `Vocab`
* Adjust many many tests

Differences:

* No default lookup lemmas
* No special treatment of TAG in `from_array` and similar required
* Easier to modify labels in a `Tagger`
* No extra strings added from morphology / tag map

* Fix test

* Initial fix for Lemmatizer config/serialization

* Adjust init test to be more generic

* Adjust init test to force empty Lookups

* Add simple cache to rule-based lemmatizer

* Convert language-specific lemmatizers

Convert language-specific lemmatizers to component lemmatizers. Remove
previous lemmatizer class.

* Fix French and Polish lemmatizers

* Remove outdated UPOS conversions

* Update Russian lemmatizer init in tests

* Add minimal init/run tests for custom lemmatizers

* Add option to overwrite existing lemmas

* Update mode setting, lookup loading, and caching

* Make `mode` an immutable property
* Only enforce strict `load_lookups` for known supported modes
* Move caching into individual `_lemmatize` methods

* Implement strict when lang is not found in lookups

* Fix tables/lookups in make_lemmatizer

* Reallow provided lookups and allow for stricter checks

* Add lookups asset to all Lemmatizer pipe tests

* Rename lookups in lemmatizer init test

* Clean up merge

* Refactor lookup table loading

* Add helper from `load_lemmatizer_lookups` that loads required and
optional lookups tables based on settings provided by a config.

Additional slight refactor of lookups:

* Add `Lookups.set_table` to set a table from a provided `Table`
* Reorder class definitions to be able to specify type as `Table`

* Move registry assets into test methods

* Refactor lookups tables config

Use class methods within `Lemmatizer` to provide the config for
particular modes and to load the lookups from a config.

* Add pipe and score to lemmatizer

* Simplify Tagger.score

* Add missing import

* Clean up imports and auto-format

* Remove unused kwarg

* Tidy up and auto-format

* Update docstrings for Lemmatizer

Update docstrings for Lemmatizer.

Additionally modify `is_base_form` API to take `Token` instead of
individual features.

* Update docstrings

* Remove tag map values from Tagger.add_label

* Update API docs

* Fix relative link in Lemmatizer API docs
											
										
										
											2020-08-07 16:27:13 +03:00
+								            self.c.tag = tag
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property dep:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of syntactic dependency label."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.dep
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        def __set__(self, attr_t label):
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								            self.c.dep = label
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def has_vector(self):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """A boolean value indicating whether a word vector is associated with
 								        the object.
 								        RETURNS (bool): Whether a word vector is associated with the object.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#has_vector
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        if "has_vector" in self.doc.user_token_hooks:
 								            return self.doc.user_token_hooks["has_vector"](self)
 								        if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
 								            return True
 								        return self.vocab.has_vector(self.c.lex.orth)
-												* Add has_vector attribute to Token and Lexeme

											
										
										
											2015-09-21 12:52:43 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def vector(self):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """A real-valued meaning representation.
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
 								            representing the token's semantics.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#vector
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        if "vector" in self.doc.user_token_hooks:
 								            return self.doc.user_token_hooks["vector"](self)
 								        if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
 								            return self.doc.tensor[self.i]
 								        else:
 								            return self.vocab.get_vector(self.c.lex.orth)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def vector_norm(self):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-20 16:13:33 +03:00
+								        """The L2 norm of the token's vector representation.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
 								        RETURNS (float): The L2 norm of the vector representation.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#vector_norm
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        if "vector_norm" in self.doc.user_token_hooks:
 								            return self.doc.user_token_hooks["vector_norm"](self)
 								        vector = self.vector
-												Fix similarity calculation if vectors are on GPU (#3440)


											
										
										
											2019-03-20 14:09:59 +03:00
+								        xp = get_array_module(vector)
 								        total = (vector ** 2).sum()
 								        return xp.sqrt(total) if total != 0. else 0.
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
-												Add span.tensor and token.tensor attributes

											
										
										
											2019-08-01 19:30:50 +03:00
+								    @property
 								    def tensor(self):
 								        """The token's entry in the parent `Doc`'s tensor.
 								        RETURNS: `self.doc.tensor[self.i]`, or `None` if the `Doc` has no
 								            tensor.
 								        """
 								        doc_tensor = self.doc.tensor
 								        return None if doc_tensor is None else doc_tensor[self.i]
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def n_lefts(self):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """The number of leftward immediate children of the word, in the
 								        syntactic dependency parse.
 								        RETURNS (int): The number of leftward immediate children of the
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								            word, in the syntactic dependency parse.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#n_lefts
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.c.l_kids
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def n_rights(self):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """The number of rightward immediate children of the word, in the
 								        syntactic dependency parse.
 								        RETURNS (int): The number of rightward immediate children of the
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								            word, in the syntactic dependency parse.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#n_rights
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.c.r_kids
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def sent(self):
-												Add sent property to token (#2521)

* Add sent property to token

* Refactored and cleaned up copy paste errors.

											
										
										
											2018-07-06 16:54:15 +03:00
+								        """RETURNS (Span): The sentence span that the token is a part of."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        if 'sent' in self.doc.user_token_hooks:
 								            return self.doc.user_token_hooks["sent"](self)
 								        return self.doc[self.i : self.i+1].sent
-												Add sent property to token (#2521)

* Add sent property to token

* Refactored and cleaned up copy paste errors.

											
										
										
											2018-07-06 16:54:15 +03:00
-												* Add Token.sent_start property, re Issue #235

											
										
										
											2016-05-05 12:53:20 +03:00
+								    property sent_start:
 								        def __get__(self):
-												💫 Add better and serializable sentencizer (#3471)

* Add better serializable sentencizer component

* Replace default factory

* Add tests

* Tidy up

* Pass test

* Update docs

											
										
										
											2019-03-23 17:45:02 +03:00
+								            """Deprecated: use Token.is_sent_start instead."""
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								            # Raising a deprecation warning here causes errors for autocomplete
-												Add Token.is_sent_start property, so can deprecate Token.sent_start

											
										
										
											2017-11-01 15:27:14 +03:00
+								            # Handle broken backwards compatibility case: doc[0].sent_start
 								            # was False.
 								            if self.i == 0:
 								                return False
 								            else:
-												Fix infinite recursion in token.sent_start. Closes #1640

											
										
										
											2018-01-14 17:02:15 +03:00
+								                return self.c.sent_start
-												Add Token.is_sent_start property, so can deprecate Token.sent_start

											
										
										
											2017-11-01 15:27:14 +03:00
 								        def __set__(self, value):
 								            self.is_sent_start = value
 								    property is_sent_start:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """A boolean value indicating whether the token starts a sentence.
 								        `None` if unknown. Defaults to `True` for the first token in the `Doc`.
 								        RETURNS (bool / None): Whether the token starts a sentence.
-												Add Token.is_sent_start property, so can deprecate Token.sent_start

											
										
										
											2017-11-01 15:27:14 +03:00
+								            None if unknown.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#is_sent_start
-												Add Token.is_sent_start property, so can deprecate Token.sent_start

											
										
										
											2017-11-01 15:27:14 +03:00
+								        """
 								        def __get__(self):
 								            # The C-level flag is ternary: zero means the boundary is unknown,
 								            # a negative value rules it out, anything else marks a sentence
 								            # start.
 								            flag = self.c.sent_start
 								            if flag == 0:
 								                return None
 								            if flag < 0:
 								                return False
 								            return True
-												* Add Token.sent_start property, re Issue #235

											
										
										
											2016-05-05 12:53:20 +03:00
-												Add ternary value setting to Token.sent_start

											
										
										
											2017-10-09 00:51:58 +03:00
+								        def __set__(self, value):
-												Refactor Docs.is_ flags (#6044)

* Refactor Docs.is_ flags

* Add derived `Doc.has_annotation` method

  * `Doc.has_annotation(attr)` returns `True` for partial annotation

  * `Doc.has_annotation(attr, require_complete=True)` returns `True` for
    complete annotation

* Add deprecation warnings to `is_tagged`, `is_parsed`, `is_sentenced`
and `is_nered`

* Add `Doc._get_array_attrs()`, which returns a full list of `Doc` attrs
for use with `Doc.to_array`, `Doc.to_bytes` and `Doc.from_docs`. The
list is the `DocBin` attributes list plus `SPACY` and `LENGTH`.

Notes on `Doc.has_annotation`:

* `HEAD` is converted to `DEP` because heads don't have an unset state

* Accept `IS_SENT_START` as a synonym of `SENT_START`

Additional changes:

* Add `NORM`, `ENT_ID` and `SENT_START` to default attributes for
`DocBin`

* In `Doc.from_array()` the presence of `DEP` causes `HEAD` to override
`SENT_START`

* In `Doc.from_array()` using `attrs` other than
`Doc._get_array_attrs()` (i.e., a user's custom list rather than our
default internal list) with both `HEAD` and `SENT_START` shows a warning
that `HEAD` will override `SENT_START`

* `set_children_from_heads` does not require dependency labels to set
sentence boundaries and sets `sent_start` for all non-sentence starts to
`-1`

* Fix call to set_children_from_heads

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-09-17 01:14:01 +03:00
+								            if self.doc.has_annotation("DEP"):
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                raise ValueError(Errors.E043)
-												Add ternary value setting to Token.sent_start

											
										
										
											2017-10-09 00:51:58 +03:00
+								            if value is None:
 								                self.c.sent_start = 0
 								            elif value is True:
 								                self.c.sent_start = 1
 								            elif value is False:
 								                self.c.sent_start = -1
 								            else:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                raise ValueError(Errors.E044.format(value=value))
-												* Add Token.sent_start property, re Issue #235

											
										
										
											2016-05-05 12:53:20 +03:00
-												Add is_sent_end token property (#5375)

Reconstruction of the original PR #4697 by @MiniLau.

Removes unused `SENT_END` symbol and `IS_SENT_END` from `Matcher` schema
because the Matcher is only going to be able to support `IS_SENT_START`.
											
										
										
											2020-04-29 13:53:16 +03:00
+								    property is_sent_end:
 								        """A boolean value indicating whether the token ends a sentence.
 								        `None` if unknown. Defaults to `True` for the last token in the `Doc`.
 								        RETURNS (bool / None): Whether the token ends a sentence.
 								            None if unknown.
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#is_sent_end
-												Add is_sent_end token property (#5375)

Reconstruction of the original PR #4697 by @MiniLau.

Removes unused `SENT_END` symbol and `IS_SENT_END` from `Matcher` schema
because the Matcher is only going to be able to support `IS_SENT_START`.
											
										
										
											2020-04-29 13:53:16 +03:00
+								        """
 								        def __get__(self):
 								            # The last token of the doc always closes a sentence.
 								            if self.i + 1 == len(self.doc):
 								                return True
 								            # Otherwise the answer mirrors the next token's `is_sent_start`
 								            # (None, True or False). Fetch it once and compare by identity
 								            # instead of using `== None` / `== True`.
 								            next_starts = self.doc[self.i + 1].is_sent_start
 								            if next_starts is None:
 								                return None
 								            return next_starts is True
 								        def __set__(self, value):
 								            # Assignment is always rejected: the getter derives this value
 								            # from the following token's `is_sent_start`.
 								            raise ValueError(Errors.E196)
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def lefts(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """The leftward immediate children of the word, in the syntactic
 								        dependency parse.
 								        YIELDS (Token): A left-child of the token.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#lefts
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        cdef int nr_iter = 0
 								        cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
 								        while ptr < self.c:
 								            if ptr + ptr.head == self.c:
 								                yield self.doc[ptr - (self.c - self.i)]
 								            ptr += 1
 								            nr_iter += 1
 								            # This is ugly, but it's a way to guard out infinite loops
 								            if nr_iter >= 10000000:
 								                raise RuntimeError(Errors.E045.format(attr="token.lefts"))
 								    @property
 								    def rights(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """The rightward immediate children of the word, in the syntactic
 								        dependency parse.
 								        YIELDS (Token): A right-child of the token.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#rights
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
 								        tokens = []
 								        cdef int nr_iter = 0
 								        while ptr > self.c:
 								            if ptr + ptr.head == self.c:
 								                tokens.append(self.doc[ptr - (self.c - self.i)])
 								            ptr -= 1
 								            nr_iter += 1
 								            if nr_iter >= 10000000:
 								                raise RuntimeError(Errors.E045.format(attr="token.rights"))
 								        tokens.reverse()
 								        for t in tokens:
 								            yield t
 								    @property
 								    def children(self):
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """A sequence of the token's immediate syntactic children.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        YIELDS (Token): A child token such that `child.head==self`.
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#children
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        yield from self.lefts
 								        yield from self.rights
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def subtree(self):
-												Correct docs of `Token.subtree` and `Span.subtree` (issue #3122) (#3124)

* solve inconsistency between docs and Span.subtree (issue #3122)

* solve inconsistency between docs and Token.subtree (issue #3122)

											
										
										
											2019-01-09 05:11:15 +03:00
+								        """A sequence containing the token and all the token's syntactic
 								        descendants.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        YIELDS (Token): A descendent token such that
-												Correct docs of `Token.subtree` and `Span.subtree` (issue #3122) (#3124)

* solve inconsistency between docs and Span.subtree (issue #3122)

* solve inconsistency between docs and Token.subtree (issue #3122)

											
										
										
											2019-01-09 05:11:15 +03:00
+								            `self.is_ancestor(descendent) or descendent == self`.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#subtree
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        for word in self.lefts:
 								            yield from word.subtree
 								        yield self
 								        for word in self.rights:
 								            yield from word.subtree
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
fails. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confusion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    def left_edge(self) -> int:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """The leftmost token of this token's syntactic descendents.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        RETURNS (Token): The first token such that `self.is_ancestor(token)`.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.doc[self.c.l_edge]
-												* Whitespace

											
										
										
											2015-08-09 00:37:44 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
-												🏷 Add Mypy check to CI and ignore all existing Mypy errors (#9167)

* 🚨 Ignore all existing Mypy errors

* 🏗 Add Mypy check to CI

* Add types-mock and types-requests as dev requirements

* Add additional type ignore directives

* Add types packages to dev-only list in reqs test

* Add types-dataclasses for python 3.6

* Add ignore to pretrain

* 🏷 Improve type annotation on `run_command` helper

The `run_command` helper previously declared that it returned an
`Optional[subprocess.CompletedProcess]`, but it isn't actually possible
for the function to return `None`. These changes modify the type
annotation of the `run_command` helper and remove all now-unnecessary
`# type: ignore` directives.

* 🔧 Allow variable type redefinition in limited contexts

These changes modify how Mypy is configured to allow variables to have
their type automatically redefined under certain conditions. The Mypy
documentation contains the following example:

```python
def process(items: List[str]) -> None:
    # 'items' has type List[str]
    items = [item.split() for item in items]
    # 'items' now has type List[List[str]]
    ...
```

This configuration change is especially helpful in reducing the number
of `# type: ignore` directives needed to handle the common pattern of:
* Accepting a filepath as a string
* Overwriting the variable using `filepath = ensure_path(filepath)`

These changes enable redefinition and remove all `# type: ignore`
directives rendered redundant by this change.

* 🏷 Add type annotation to converters mapping

* 🚨 Fix Mypy error in convert CLI argument verification

* 🏷 Improve type annotation on `resolve_dot_names` helper

* 🏷 Add type annotations for `Vocab` attributes `strings` and `vectors`

* 🏷 Add type annotations for more `Vocab` attributes

* 🏷 Add loose type annotation for gold data compilation

* 🏷 Improve `_format_labels` type annotation

* 🏷 Fix `get_lang_class` type annotation

* 🏷 Loosen return type of `Language.evaluate`

* 🏷 Don't accept `Scorer` in `handle_scores_per_type`

* 🏷 Add `string_to_list` overloads

* 🏷 Fix non-Optional command-line options

* 🙈 Ignore redefinition of `wandb_logger` in `loggers.py`

* ➕ Install `typing_extensions` in Python 3.8+

The `typing_extensions` package states that it should be used when
"writing code that must be compatible with multiple Python versions".
Since SpaCy needs to support multiple Python versions, it should be used
when newer `typing` module members are required. One example of this is
`Literal`, which is available starting with Python 3.8.

Previously SpaCy tried to import `Literal` from `typing`, falling back
to `typing_extensions` if the import failed. However, Mypy doesn't seem
to be able to understand what `Literal` means when the initial import
fails. Therefore, these changes modify how `compat` imports `Literal` by
always importing it from `typing_extensions`.

These changes also modify how `typing_extensions` is installed, so that
it is a requirement for all Python versions, including those greater
than or equal to 3.8.

* 🏷 Improve type annotation for `Language.pipe`

These changes add a missing overload variant to the type signature of
`Language.pipe`. Additionally, the type signature is enhanced to allow
type checkers to differentiate between the two overload variants based
on the `as_tuple` parameter.

Fixes #8772

* ➖ Don't install `typing-extensions` in Python 3.8+

After more detailed analysis of how to implement Python version-specific
type annotations using SpaCy, it has been determined that branching
on a comparison against `sys.version_info` can be statically analyzed by
Mypy well enough to enable us to conditionally use
`typing_extensions.Literal`. This means that we no longer need to
install `typing_extensions` for Python versions greater than or equal to
3.8! 🎉

These changes revert previous changes installing `typing-extensions`
regardless of Python version and modify how we import the `Literal` type
to ensure that Mypy treats it properly.

* resolve mypy errors for Strict pydantic types

* refactor code to avoid missing return statement

* fix types of convert CLI command

* avoid list-set confusion in debug_data

* fix typo and formatting

* small fixes to avoid type ignores

* fix types in profile CLI command and make it more efficient

* type fixes in projects CLI

* put one ignore back

* type fixes for render

* fix render types - the sequel

* fix BaseDefault in language definitions

* fix type of noun_chunks iterator - yields tuple instead of span

* fix types in language-specific modules

* 🏷 Expand accepted inputs of `get_string_id`

`get_string_id` accepts either a string (in which case it returns its 
ID) or an ID (in which case it immediately returns the ID). These 
changes extend the type annotation of `get_string_id` to indicate that 
it can accept either strings or IDs.

* 🏷 Handle override types in `combine_score_weights`

The `combine_score_weights` function allows users to pass an `overrides` 
mapping to override data extracted from the `weights` argument. Since it 
allows `Optional` dictionary values, the return value may also include 
`Optional` dictionary values.

These changes update the type annotations for `combine_score_weights` to 
reflect this fact.

* 🏷 Fix tokenizer serialization method signatures in `DummyTokenizer`

* 🏷 Fix redefinition of `wandb_logger`

These changes fix the redefinition of `wandb_logger` by giving a 
separate name to each `WandbLogger` version. For 
backwards-compatibility, `spacy.train` still exports `wandb_logger_v3` 
as `wandb_logger` for now.

* more fixes for typing in language

* type fixes in model definitions

* 🏷 Annotate `_RandomWords.probs` as `NDArray`

* 🏷 Annotate `tok2vec` layers to help Mypy

* 🐛 Fix `_RandomWords.probs` type annotations for Python 3.6

Also remove an import that I forgot to move to the top of the module 😅

* more fixes for matchers and other pipeline components

* quick fix for entity linker

* fixing types for spancat, textcat, etc

* bugfix for tok2vec

* type annotations for scorer

* add runtime_checkable for Protocol

* type and import fixes in tests

* mypy fixes for training utilities

* few fixes in util

* fix import

* 🐵 Remove unused `# type: ignore` directives

* 🏷 Annotate `Language._components`

* 🏷 Annotate `spacy.pipeline.Pipe`

* add doc as property to span.pyi

* small fixes and cleanup

* explicit type annotations instead of via comment

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
											
										
										
											2021-10-14 16:21:40 +03:00
+								    def right_edge(self) -> int:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """The rightmost token of this token's syntactic descendents.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        RETURNS (Token): The last token such that `self.is_ancestor(token)`.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.doc[self.c.r_edge]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def ancestors(self):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """A sequence of this token's syntactic ancestors.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        YIELDS (Token): A sequence of ancestor tokens such that
 								            `ancestor.is_ancestor(self)`.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#ancestors
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        cdef const TokenC* head_ptr = self.c
 								        # Guard against infinite loop, no token can have
 								        # more ancestors than tokens in the tree.
 								        cdef int i = 0
 								        while head_ptr.head != 0 and i < self.doc.length:
 								            head_ptr += head_ptr.head
 								            yield self.doc[head_ptr - (self.c - self.i)]
 								            i += 1
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
+								    def is_ancestor(self, descendant):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Check whether this token is a parent, grandparent, etc. of another
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
+								        in the dependency tree.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        descendant (Token): Another token.
 								        RETURNS (bool): Whether this token is the ancestor of the descendant.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#is_ancestor
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Fix variable error in token

											
										
										
											2016-11-01 15:28:00 +03:00
+								        if self.doc is not descendant.doc:
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
+								            return False
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        return any(ancestor.i == self.i for ancestor in descendant.ancestors)
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
-												introduce token.has_head and refer to MISSING_DEP_ (WIP)

											
										
										
											2021-01-12 19:17:06 +03:00
+								    def has_head(self):
 								        """Check whether the token has annotated head information.
-												cleanup

											
										
										
											2021-01-13 16:20:05 +03:00
+								        Return False when the head annotation is unset/missing.
-												introduce token.has_head and refer to MISSING_DEP_ (WIP)

											
										
										
											2021-01-12 19:17:06 +03:00
 								        RETURNS (bool): Whether the head annotation is valid or not.
 								        """
-												cleanup

											
										
										
											2021-01-13 16:20:05 +03:00
+								        return not Token.missing_head(self.c)
-												introduce token.has_head and refer to MISSING_DEP_ (WIP)

											
										
										
											2021-01-12 19:17:06 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property head:
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        """The syntactic parent, or "governor", of this token.
 								        If token.has_head() is `False`, this method will return itself.
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        RETURNS (Token): The token predicted by the parser to be the head of
-												introduce token.has_head and refer to MISSING_DEP_ (WIP)

											
										
										
											2021-01-12 19:17:06 +03:00
+								            the current token.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												introduce token.has_head and refer to MISSING_DEP_ (WIP)

											
										
										
											2021-01-12 19:17:06 +03:00
+								            if not self.has_head():
 								                return self
 								            else:
 								                return self.doc[self.i + self.c.head]
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								        def __set__(self, Token new_head):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            # This function sets the head of self to new_head and updates the
 								            # counters for left/right dependents and left/right corner for the
 								            # new and the old head
-												Improve token head verification (#5079)

* Improve token head verification

Improve the verification for valid token heads when heads are set:

* in `Token.head`: heads come from the same document
* in `Doc.from_array()`: head indices are within the bounds of the
document

* Improve error message

											
										
										
											2020-03-03 23:44:51 +03:00
+								            # Check that token is from the same document
 								            if self.doc != new_head.doc:
 								                raise ValueError(Errors.E191)
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            # Do nothing if old head is new head
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								            if self.i + self.c.head == new_head.i:
 								                return
-												Clean up spacy.tokens (#6046)

* Clean up spacy.tokens

* Update `set_children_from_heads`:
  * Don't check `dep` when setting lr_* or sentence starts
  * Set all non-sentence starts to `False`

* Use `set_children_from_heads` in `Token.head` setter
  * Reduce similar/duplicate code (admittedly adds a bit of overhead)
  * Update sentence starts consistently

* Remove unused `Doc.set_parse`

* Minor changes:
  * Declare cython variables (to avoid cython warnings)
  * Clean up imports

* Modify set_children_from_heads to set token range

Modify `set_children_from_heads` so that it adjusts tokens within a
specified range rather than the whole document.

Modify the `Token.head` setter to adjust only the tokens affected by the
new head assignment.
											
										
										
											2020-09-16 21:32:38 +03:00
+								            # Find the widest l/r_edges of the roots of the two tokens involved
 								            # to limit the number of tokens for set_children_from_heads
 								            cdef Token self_root, new_head_root
-												DependencyMatcher improvements (fix #6678) (#6744)

* Adding contributor agreement for user werew

* [DependencyMatcher] Comment and clean code

* [DependencyMatcher] Use defaultdicts

* [DependencyMatcher] Simplify _retrieve_tree method

* [DependencyMatcher] Remove prepended underscores

* [DependencyMatcher] Address TODO and move grouping of token's positions out of the loop

* [DependencyMatcher] Remove _nodes attribute

* [DependencyMatcher] Use enumerate in _retrieve_tree method

* [DependencyMatcher] Clean unused vars and use camel_case naming

* [DependencyMatcher] Memoize node+operator map

* Add root property to Token

* [DependencyMatcher] Groups matches by root

* [DependencyMatcher] Remove unused _keys_to_token attribute

* [DependencyMatcher] Use a list to map tokens to matcher's keys

* [DependencyMatcher] Remove recursion

* [DependencyMatcher] Use a generator to retrieve matches

* [DependencyMatcher] Remove unused memory pool

* [DependencyMatcher] Hide private methods and attributes

* [DependencyMatcher] Improvements to the matches validation

* Apply suggestions from code review

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* [DependencyMatcher] Fix keys_to_position_maps

* Remove Token.root property

* [DependencyMatcher] Remove functools' lru_cache

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2021-01-22 03:20:08 +03:00
+								            self_root = ([self] + list(self.ancestors))[-1]
-												Clean up spacy.tokens (#6046)

* Clean up spacy.tokens

* Update `set_children_from_heads`:
  * Don't check `dep` when setting lr_* or sentence starts
  * Set all non-sentence starts to `False`

* Use `set_children_from_heads` in `Token.head` setter
  * Reduce similar/duplicate code (admittedly adds a bit of overhead)
  * Update sentence starts consistently

* Remove unused `Doc.set_parse`

* Minor changes:
  * Declare cython variables (to avoid cython warnings)
  * Clean up imports

* Modify set_children_from_heads to set token range

Modify `set_children_from_heads` so that it adjusts tokens within a
specified range rather than the whole document.

Modify the `Token.head` setter to adjust only the tokens affected by the
new head assignment.
											
										
										
											2020-09-16 21:32:38 +03:00
+								            new_head_ancestors = list(new_head.ancestors)
 								            new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head
 								            start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge
 								            end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            # Set new head
-												Clean up spacy.tokens (#6046)

* Clean up spacy.tokens

* Update `set_children_from_heads`:
  * Don't check `dep` when setting lr_* or sentence starts
  * Set all non-sentence starts to `False`

* Use `set_children_from_heads` in `Token.head` setter
  * Reduce similar/duplicate code (admittedly adds a bit of overhead)
  * Update sentence starts consistently

* Remove unused `Doc.set_parse`

* Minor changes:
  * Declare cython variables (to avoid cython warnings)
  * Clean up imports

* Modify set_children_from_heads to set token range

Modify `set_children_from_heads` so that it adjusts tokens within a
specified range rather than the whole document.

Modify the `Token.head` setter to adjust only the tokens affected by the
new head assignment.
											
										
										
											2020-09-16 21:32:38 +03:00
+								            self.c.head = new_head.i - self.i
 								            # Adjust parse properties and sentence starts
 								            set_children_from_heads(self.doc.c, start, end + 1)
-												* Whitespace

											
										
										
											2015-08-09 00:37:44 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def conjuncts(self):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """A sequence of coordinated tokens, including the token itself.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Fix token.conjuncts (closes #795) (#3392)

* Implement conjuncts method

* Add span.conjuncts property

* Un-xfail token.conjuncts tests

* Update docs for token.conjuncts and span.conjuncts

* Fix merge error in token.conjuncts

											
										
										
											2019-03-11 19:05:45 +03:00
+								        RETURNS (tuple): The coordinated tokens.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								        DOCS: https://spacy.io/api/token#conjuncts
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Fix token.conjuncts (closes #795) (#3392)

* Implement conjuncts method

* Add span.conjuncts property

* Un-xfail token.conjuncts tests

* Update docs for token.conjuncts and span.conjuncts

* Fix merge error in token.conjuncts

											
										
										
											2019-03-11 19:05:45 +03:00
+								        cdef Token word, child
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        if "conjuncts" in self.doc.user_token_hooks:
-												Fix token.conjuncts (closes #795) (#3392)

* Implement conjuncts method

* Add span.conjuncts property

* Un-xfail token.conjuncts tests

* Update docs for token.conjuncts and span.conjuncts

* Fix merge error in token.conjuncts

											
										
										
											2019-03-11 19:05:45 +03:00
+								            return tuple(self.doc.user_token_hooks["conjuncts"](self))
 								        start = self
 								        while start.i != start.head.i:
 								            if start.dep == conj:
 								                start = start.head
-												Defer some attributes to Doc, via getters_for_tokens attribute.

											
										
										
											2016-10-17 03:44:49 +03:00
+								            else:
-												Fix token.conjuncts (closes #795) (#3392)

* Implement conjuncts method

* Add span.conjuncts property

* Un-xfail token.conjuncts tests

* Update docs for token.conjuncts and span.conjuncts

* Fix merge error in token.conjuncts

											
										
										
											2019-03-11 19:05:45 +03:00
+								                break
 								        queue = [start]
 								        output = [start]
 								        for word in queue:
 								            for child in word.rights:
 								                if child.c.dep == conj:
 								                    output.append(child)
 								                    queue.append(child)
 								        return tuple([w for w in output if w.i != self.i])
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property ent_type:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): Named entity type."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.ent_type
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        def __set__(self, ent_type):
 								            self.c.ent_type = ent_type
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property ent_type_:
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): Named entity type."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.ent_type]
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        def __set__(self, ent_type):
 								            self.c.ent_type = self.vocab.strings.add(ent_type)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def ent_iob(self):
 								        """IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
 								        is assigned.
 								        RETURNS (uint64): IOB code of named entity tag.
 								        """
 								        return self.c.ent_iob
-												Improve spacy.gold (no GoldParse, no json format!) (#5555)

* Update errors

* Remove beam for now (maybe)

Remove beam_utils

Update setup.py

Remove beam

* Remove GoldParse

WIP on removing goldparse

Get ArcEager compiling after GoldParse excise

Update setup.py

Get spacy.syntax compiling after removing GoldParse

Rename NewExample -> Example and clean up

Clean html files

Start updating tests

Update Morphologizer

* fix error numbers

* fix merge conflict

* informative error when calling to_array with wrong field

* fix error catching

* fixing language and scoring tests

* start testing get_aligned

* additional tests for new get_aligned function

* Draft create_gold_state for arc_eager oracle

* Fix import

* Fix import

* Remove TokenAnnotation code from nonproj

* fixing NER one-to-many alignment

* Fix many-to-one IOB codes

* fix test for misaligned

* attempt to fix cases with weird spaces

* fix spaces

* test_gold_biluo_different_tokenization works

* allow None as BILUO annotation

* fixed some tests + WIP roundtrip unit test

* add spaces to json output format

* minibatch utility can deal with strings, docs or examples

* fix augment (needs further testing)

* various fixes in scripts - needs to be further tested

* fix test_cli

* cleanup

* correct silly typo

* add support for MORPH in to/from_array, fix morphologizer overfitting test

* fix tagger

* fix entity linker

* ensure test keeps working with non-linked entities

* pipe() takes docs, not examples

* small bug fix

* textcat bugfix

* throw informative error when running the components with the wrong type of objects

* fix parser tests to work with example (most still failing)

* fix BiluoPushDown parsing entities

* small fixes

* bugfix tok2vec

* fix renames and simple_ner labels

* various small fixes

* prevent writing dummy values like deps because that could interfere with sent_start values

* fix the fix

* implement split_sent with aligned SENT_START attribute

* test for split sentences with various alignment issues, works

* Return ArcEagerGoldParse from ArcEager

* Update parser and NER gold stuff

* Draft new GoldCorpus class

* add links to to_dict

* clean up

* fix test checking for variants

* Fix oracles

* Start updating converters

* Move converters under spacy.gold

* Move things around

* Fix naming

* Fix name

* Update converter to produce DocBin

* Update converters

* Allow DocBin to take list of Doc objects.

* Make spacy convert output docbin

* Fix import

* Fix docbin

* Fix compile in ArcEager

* Fix import

* Serialize all attrs by default

* Update converter

* Remove jsonl converter

* Add json2docs converter

* Draft Corpus class for DocBin

* Work on train script

* Update Corpus

* Update DocBin

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Update train.py

* Fix Corpus

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

* Skip tests that cause crashes

* Skip test causing segfault

* Remove GoldCorpus

* Update imports

* Update after removing GoldCorpus

* Fix module name of corpus

* Fix mimport

* Work on parser oracle

* Update arc_eager oracle

* Restore ArcEager.get_cost function

* Update transition system

* Update test_arc_eager_oracle

* Remove beam test

* Update test

* Unskip

* Unskip tests

* add links to to_dict

* clean up

* fix test checking for variants

* Allow DocBin to take list of Doc objects.

* Fix compile in ArcEager

* Serialize all attrs by default

Move converters under spacy.gold

Move things around

Fix naming

Fix name

Update converter to produce DocBin

Update converters

Make spacy convert output docbin

Fix import

Fix docbin

Fix import

Update converter

Remove jsonl converter

Add json2docs converter

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Start updating converters

* Work on train script

* Draft Corpus class for DocBin

Update Corpus

Fix Corpus

* Update DocBin

Add missing strings when serializing

* Update train.py

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

Skip tests that cause crashes

Skip test causing segfault

* Remove GoldCorpus

Update imports

Update after removing GoldCorpus

Fix module name of corpus

Fix mimport

* Work on parser oracle

Update arc_eager oracle

Restore ArcEager.get_cost function

Update transition system

* Update tests

Remove beam test

Update test

Unskip

Unskip tests

* Add get_aligned_parse method in Example

Fix Example.get_aligned_parse

* Add kwargs to Corpus.dev_dataset to match train_dataset

* Update nonproj

* Use get_aligned_parse in ArcEager

* Add another arc-eager oracle test

* Remove Example.doc property

Remove Example.doc

Remove Example.doc

Remove Example.doc

Remove Example.doc

* Update ArcEager oracle

Fix Break oracle

* Debugging

* Fix Corpus

* Fix eg.doc

* Format

* small fixes

* limit arg for Corpus

* fix test_roundtrip_docs_to_docbin

* fix test_make_orth_variants

* fix add_label test

* Update tests

* avoid writing temp dir in json2docs, fixing 4402 test

* Update test

* Add missing costs to NER oracle

* Update test

* Work on Example.get_aligned_ner method

* Clean up debugging

* Xfail tests

* Remove prints

* Remove print

* Xfail some tests

* Replace unseen labels for parser

* Update test

* Update test

* Xfail test

* Fix Corpus

* fix imports

* fix docs_to_json

* various small fixes

* cleanup

* Support gold_preproc in Corpus

* Support gold_preproc

* Pass gold_preproc setting into corpus

* Remove debugging

* Fix gold_preproc

* Fix json2docs converter

* Fix convert command

* Fix flake8

* Fix import

* fix output_dir (converted to Path by typer)

* fix var

* bugfix: update states after creating golds to avoid out of bounds indexing

* Improve efficiency of ArcEager oracle

* pull merge_sent into iob2docs to avoid Doc creation for each line

* fix asserts

* bugfix excl Span.end in iob2docs

* Support max_length in Corpus

* Fix arc_eager oracle

* Filter out unannotated sentences in NER

* Remove debugging in parser

* Simplify NER alignment

* Fix conversion of NER data

* Fix NER init_gold_batch

* Tweak efficiency of precomputable affine

* Update onto-json default

* Update gold test for NER

* Fix parser test

* Update test

* Add NER data test

* Fix convert for single file

* Fix test

* Hack scorer to avoid evaluating non-nered data

* Fix handling of NER data in Example

* Output unlabelled spans from O biluo tags in iob_utils

* Fix unset variable

* Return kept examples from init_gold_batch

* Return examples from init_gold_batch

* Dont return Example from init_gold_batch

* Set spaces on gold doc after conversion

* Add test

* Fix spaces reading

* Improve NER alignment

* Improve handling of missing values in NER

* Restore the 'cutting' in parser training

* Add assertion

* Print epochs

* Restore random cuts in parser/ner training

* Implement Doc.copy

* Implement Example.copy

* Copy examples at the start of Language.update

* Don't unset example docs

* Tweak parser model slightly

* attempt to fix _guess_spaces

* _add_entities_to_doc first, so that links don't get overwritten

* fixing get_aligned_ner for one-to-many

* fix indexing into x_text

* small fix biluo_tags_from_offsets

* Add onto-ner config

* Simplify NER alignment

* Fix NER scoring for partially annotated documents

* fix indexing into x_text

* fix test_cli failing tests by ignoring spans in doc.ents with empty label

* Fix limit

* Improve NER alignment

* Fix count_train

* Remove print statement

* fix tests, we're not having nothing but None

* fix clumsy fingers

* Fix tests

* Fix doc.ents

* Remove empty docs in Corpus and improve limit

* Update config

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
											
										
										
											2020-06-26 20:34:12 +03:00
+								    @classmethod
 								    def iob_strings(cls):
 								        return ("", "I", "O", "B")
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def ent_iob_(self):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """IOB code of named entity tag. "B" means the token begins an entity,
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        "I" means it is inside an entity, "O" means it is outside an entity,
-												Distinction between outside, missing and blocked NER annotations (#4307)

* remove duplicate unit test

* unit test (currently failing) for issue 4267

* bugfix: ensure doc.ents preserves kb_id annotations

* fix in setting doc.ents with empty label

* rename

* test for presetting an entity to a certain type

* allow overwriting Outside + blocking presets

* fix actions when previous label needs to be kept

* fix default ent_iob in set entities

* cleaner solution with U- action

* remove debugging print statements

* unit tests with explicit transitions and is_valid testing

* remove U- from move_names explicitly

* remove unit tests with pre-trained models that don't work

* remove (working) unit tests with pre-trained models

* clean up unit tests

* move unit tests

* small fixes

* remove two TODO's from doc.ents comments

											
										
										
											2019-09-18 22:37:17 +03:00
+								        and "" means no entity tag is set. "B" with an empty ent_type
 								        means that the token is blocked from further processing by NER.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        RETURNS (str): IOB code of named entity tag.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """
-												Improve spacy.gold (no GoldParse, no json format!) (#5555)

* Update errors

* Remove beam for now (maybe)

Remove beam_utils

Update setup.py

Remove beam

* Remove GoldParse

WIP on removing goldparse

Get ArcEager compiling after GoldParse excise

Update setup.py

Get spacy.syntax compiling after removing GoldParse

Rename NewExample -> Example and clean up

Clean html files

Start updating tests

Update Morphologizer

* fix error numbers

* fix merge conflict

* informative error when calling to_array with wrong field

* fix error catching

* fixing language and scoring tests

* start testing get_aligned

* additional tests for new get_aligned function

* Draft create_gold_state for arc_eager oracle

* Fix import

* Fix import

* Remove TokenAnnotation code from nonproj

* fixing NER one-to-many alignment

* Fix many-to-one IOB codes

* fix test for misaligned

* attempt to fix cases with weird spaces

* fix spaces

* test_gold_biluo_different_tokenization works

* allow None as BILUO annotation

* fixed some tests + WIP roundtrip unit test

* add spaces to json output format

* minibatch utility can deal with strings, docs or examples

* fix augment (needs further testing)

* various fixes in scripts - needs to be further tested

* fix test_cli

* cleanup

* correct silly typo

* add support for MORPH in to/from_array, fix morphologizer overfitting test

* fix tagger

* fix entity linker

* ensure test keeps working with non-linked entities

* pipe() takes docs, not examples

* small bug fix

* textcat bugfix

* throw informative error when running the components with the wrong type of objects

* fix parser tests to work with example (most still failing)

* fix BiluoPushDown parsing entities

* small fixes

* bugfix tok2vec

* fix renames and simple_ner labels

* various small fixes

* prevent writing dummy values like deps because that could interfere with sent_start values

* fix the fix

* implement split_sent with aligned SENT_START attribute

* test for split sentences with various alignment issues, works

* Return ArcEagerGoldParse from ArcEager

* Update parser and NER gold stuff

* Draft new GoldCorpus class

* add links to to_dict

* clean up

* fix test checking for variants

* Fix oracles

* Start updating converters

* Move converters under spacy.gold

* Move things around

* Fix naming

* Fix name

* Update converter to produce DocBin

* Update converters

* Allow DocBin to take list of Doc objects.

* Make spacy convert output docbin

* Fix import

* Fix docbin

* Fix compile in ArcEager

* Fix import

* Serialize all attrs by default

* Update converter

* Remove jsonl converter

* Add json2docs converter

* Draft Corpus class for DocBin

* Work on train script

* Update Corpus

* Update DocBin

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Update train.py

* Fix Corpus

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

* Skip tests that cause crashes

* Skip test causing segfault

* Remove GoldCorpus

* Update imports

* Update after removing GoldCorpus

* Fix module name of corpus

* Fix mimport

* Work on parser oracle

* Update arc_eager oracle

* Restore ArcEager.get_cost function

* Update transition system

* Update test_arc_eager_oracle

* Remove beam test

* Update test

* Unskip

* Unskip tests

* add links to to_dict

* clean up

* fix test checking for variants

* Allow DocBin to take list of Doc objects.

* Fix compile in ArcEager

* Serialize all attrs by default

Move converters under spacy.gold

Move things around

Fix naming

Fix name

Update converter to produce DocBin

Update converters

Make spacy convert output docbin

Fix import

Fix docbin

Fix import

Update converter

Remove jsonl converter

Add json2docs converter

* Allocate Doc before starting to add words

* Make doc.from_array several times faster

* Start updating converters

* Work on train script

* Draft Corpus class for DocBin

Update Corpus

Fix Corpus

* Update DocBin

Add missing strings when serializing

* Update train.py

* Fix parser model

* Start debugging arc_eager oracle

* Update header

* Fix parser declaration

* Xfail some tests

Skip tests that cause crashes

Skip test causing segfault

* Remove GoldCorpus

Update imports

Update after removing GoldCorpus

Fix module name of corpus

Fix mimport

* Work on parser oracle

Update arc_eager oracle

Restore ArcEager.get_cost function

Update transition system

* Update tests

Remove beam test

Update test

Unskip

Unskip tests

* Add get_aligned_parse method in Example

Fix Example.get_aligned_parse

* Add kwargs to Corpus.dev_dataset to match train_dataset

* Update nonproj

* Use get_aligned_parse in ArcEager

* Add another arc-eager oracle test

* Remove Example.doc property

Remove Example.doc

Remove Example.doc

Remove Example.doc

Remove Example.doc

* Update ArcEager oracle

Fix Break oracle

* Debugging

* Fix Corpus

* Fix eg.doc

* Format

* small fixes

* limit arg for Corpus

* fix test_roundtrip_docs_to_docbin

* fix test_make_orth_variants

* fix add_label test

* Update tests

* avoid writing temp dir in json2docs, fixing 4402 test

* Update test

* Add missing costs to NER oracle

* Update test

* Work on Example.get_aligned_ner method

* Clean up debugging

* Xfail tests

* Remove prints

* Remove print

* Xfail some tests

* Replace unseen labels for parser

* Update test

* Update test

* Xfail test

* Fix Corpus

* fix imports

* fix docs_to_json

* various small fixes

* cleanup

* Support gold_preproc in Corpus

* Support gold_preproc

* Pass gold_preproc setting into corpus

* Remove debugging

* Fix gold_preproc

* Fix json2docs converter

* Fix convert command

* Fix flake8

* Fix import

* fix output_dir (converted to Path by typer)

* fix var

* bugfix: update states after creating golds to avoid out of bounds indexing

* Improve efficiency of ArcEager oracle

* pull merge_sent into iob2docs to avoid Doc creation for each line

* fix asserts

* bugfix excl Span.end in iob2docs

* Support max_length in Corpus

* Fix arc_eager oracle

* Filter out unannotated sentences in NER

* Remove debugging in parser

* Simplify NER alignment

* Fix conversion of NER data

* Fix NER init_gold_batch

* Tweak efficiency of precomputable affine

* Update onto-json default

* Update gold test for NER

* Fix parser test

* Update test

* Add NER data test

* Fix convert for single file

* Fix test

* Hack scorer to avoid evaluating non-nered data

* Fix handling of NER data in Example

* Output unlabelled spans from O biluo tags in iob_utils

* Fix unset variable

* Return kept examples from init_gold_batch

* Return examples from init_gold_batch

* Dont return Example from init_gold_batch

* Set spaces on gold doc after conversion

* Add test

* Fix spaces reading

* Improve NER alignment

* Improve handling of missing values in NER

* Restore the 'cutting' in parser training

* Add assertion

* Print epochs

* Restore random cuts in parser/ner training

* Implement Doc.copy

* Implement Example.copy

* Copy examples at the start of Language.update

* Don't unset example docs

* Tweak parser model slightly

* attempt to fix _guess_spaces

* _add_entities_to_doc first, so that links don't get overwritten

* fixing get_aligned_ner for one-to-many

* fix indexing into x_text

* small fix biluo_tags_from_offsets

* Add onto-ner config

* Simplify NER alignment

* Fix NER scoring for partially annotated documents

* fix indexing into x_text

* fix test_cli failing tests by ignoring spans in doc.ents with empty label

* Fix limit

* Improve NER alignment

* Fix count_train

* Remove print statement

* fix tests, we're not having nothing but None

* fix clumsy fingers

* Fix tests

* Fix doc.ents

* Remove empty docs in Corpus and improve limit

* Update config

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
											
										
										
											2020-06-26 20:34:12 +03:00
+								        return self.iob_strings()[self.c.ent_iob]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
+								    property ent_id:
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (uint64): ID of the entity the token is an instance of,
 								            if any.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
+								        def __get__(self):
-												Fix token.pyx

											
										
										
											2016-09-23 16:07:07 +03:00
+								            return self.c.ent_id
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
 								        def __set__(self, hash_t key):
-												Allow ent_id to be set in Token

											
										
										
											2017-03-31 15:00:14 +03:00
+								            self.c.ent_id = key
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
 								    property ent_id_:
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): ID of the entity the token is an instance of,
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								            if any.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.ent_id]
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
-												Allow ent_id to be set in Token

											
										
										
											2017-03-31 15:00:14 +03:00
+								        def __set__(self, name):
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								            self.c.ent_id = self.vocab.strings.add(name)
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
-												annotate kb_id through ents in doc

											
										
										
											2019-03-14 17:48:40 +03:00
+								    property ent_kb_id:
 								        """RETURNS (uint64): Named entity KB ID."""
 								        def __get__(self):
 								            return self.c.ent_kb_id
 								        def __set__(self, attr_t ent_kb_id):
 								            self.c.ent_kb_id = ent_kb_id
 								    property ent_kb_id_:
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): Named entity KB ID."""
-												annotate kb_id through ents in doc

											
										
										
											2019-03-14 17:48:40 +03:00
+								        def __get__(self):
 								            return self.vocab.strings[self.c.ent_kb_id]
 								        def __set__(self, ent_kb_id):
 								            self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def whitespace_(self):
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): The trailing whitespace character, if present."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return " " if self.c.spacy else ""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def orth_(self):
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): Verbatim text content (identical to
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								            `Token.text`). Exists mostly for consistency with the other
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								            attributes.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.vocab.strings[self.c.lex.orth]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def lower_(self):
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): The lowercase token text. Equivalent to
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								            `Token.text.lower()`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.vocab.strings[self.c.lex.lower]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property norm_:
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): The token's norm, i.e. a normalised form of the
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								            token text. Usually set in the language's tokenizer exceptions or
 								            norm exceptions.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Make NORM a token attribute (#3029)

See #3028. The solution in this patch is pretty debateable.

What we do is give the TokenC struct a .norm field, by repurposing the previously idle .sense attribute. It's nice to repurpose a previous field because it means the TokenC doesn't change size, so even if someone's using the internals very deeply, nothing will break.

The weird thing here is that the TokenC and the LexemeC both have an attribute named NORM. This arguably assists in backwards compatibility. On the other hand, maybe it's really bad! We're changing the semantics of the attribute subtly, so maybe it's better if someone calling lex.norm gets a breakage, and instead is told to write lex.default_norm?

Overall I believe this patch makes the NORM feature work the way we sort of expected it to work. Certainly it's much more like how the docs describe it, and more in line with how we've been directing people to use the norm attribute. We'll also be able to use token.norm to do stuff like spelling correction, which is pretty cool.

											
										
										
											2018-12-08 12:49:10 +03:00
+								            return self.vocab.strings[self.norm]
 								        def __set__(self, unicode norm_):
 								            self.c.norm = self.vocab.strings.add(norm_)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def shape_(self):
-												fix 's typo's across code base (#8384)


											
										
										
											2021-06-15 11:57:08 +03:00
+								        """RETURNS (str): Transform of the token's string, to show
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								            orthographic features. For example, "Xxxx" or "dd".
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.vocab.strings[self.c.lex.shape]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def prefix_(self):
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): A length-N substring from the start of the token.
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								            Defaults to `N=1`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.vocab.strings[self.c.lex.prefix]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def suffix_(self):
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): A length-N substring from the end of the token.
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								            Defaults to `N=3`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.vocab.strings[self.c.lex.suffix]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def lang_(self):
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): Language of the parent document's vocabulary,
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								            e.g. 'en'.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.vocab.strings[self.c.lex.lang]
-												introduce lang field for LexemeC to hold language id
put noun_chunk logic into iterators.py for each language separately

											
										
										
											2016-03-10 15:01:34 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property lemma_:
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): The token lemma, i.e. the base form of the word,
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								            with no inflectional suffixes.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Add Lemmatizer and simplify related components (#5848)

* Add Lemmatizer and simplify related components

* Add `Lemmatizer` pipe with `lookup` and `rule` modes using the
`Lookups` tables.
* Reduce `Tagger` to a simple tagger that sets `Token.tag` (no pos or lemma)
* Reduce `Morphology` to only keep track of morph tags (no tag map, lemmatizer,
or morph rules)
* Remove lemmatizer from `Vocab`
* Adjust many many tests

Differences:

* No default lookup lemmas
* No special treatment of TAG in `from_array` and similar required
* Easier to modify labels in a `Tagger`
* No extra strings added from morphology / tag map

* Fix test

* Initial fix for Lemmatizer config/serialization

* Adjust init test to be more generic

* Adjust init test to force empty Lookups

* Add simple cache to rule-based lemmatizer

* Convert language-specific lemmatizers

Convert language-specific lemmatizers to component lemmatizers. Remove
previous lemmatizer class.

* Fix French and Polish lemmatizers

* Remove outdated UPOS conversions

* Update Russian lemmatizer init in tests

* Add minimal init/run tests for custom lemmatizers

* Add option to overwrite existing lemmas

* Update mode setting, lookup loading, and caching

* Make `mode` an immutable property
* Only enforce strict `load_lookups` for known supported modes
* Move caching into individual `_lemmatize` methods

* Implement strict when lang is not found in lookups

* Fix tables/lookups in make_lemmatizer

* Reallow provided lookups and allow for stricter checks

* Add lookups asset to all Lemmatizer pipe tests

* Rename lookups in lemmatizer init test

* Clean up merge

* Refactor lookup table loading

* Add helper from `load_lemmatizer_lookups` that loads required and
optional lookups tables based on settings provided by a config.

Additional slight refactor of lookups:

* Add `Lookups.set_table` to set a table from a provided `Table`
* Reorder class definitions to be able to specify type as `Table`

* Move registry assets into test methods

* Refactor lookups tables config

Use class methods within `Lemmatizer` to provide the config for
particular modes and to load the lookups from a config.

* Add pipe and score to lemmatizer

* Simplify Tagger.score

* Add missing import

* Clean up imports and auto-format

* Remove unused kwarg

* Tidy up and auto-format

* Update docstrings for Lemmatizer

Update docstrings for Lemmatizer.

Additionally modify `is_base_form` API to take `Token` instead of
individual features.

* Update docstrings

* Remove tag map values from Tagger.add_label

* Update API docs

* Fix relative link in Lemmatizer API docs
											
										
										
											2020-08-07 16:27:13 +03:00
+								            return self.vocab.strings[self.c.lemma]
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												Allow lemma to be set from Python. Re #973

											
										
										
											2017-04-16 19:07:53 +03:00
+								        def __set__(self, unicode lemma_):
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								            self.c.lemma = self.vocab.strings.add(lemma_)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property pos_:
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): Coarse-grained part-of-speech tag."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												* Rename ATTR_IDS to attrs.IDS. Rename ATTR_NAMES to attrs.NAMES. Rename UNIV_POS_IDS to parts_of_speech.IDS

											
										
										
											2015-10-10 09:55:55 +03:00
+								            return parts_of_speech.NAMES[self.c.pos]
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Fix #2014: token.pos_ not writeable

											
										
										
											2018-03-27 22:21:11 +03:00
+								        def __set__(self, pos_name):
-												Validate pos values when creating Doc (#9148)

* Validate pos values when creating Doc

* Add clear error when setting invalid pos

This also changes the error language slightly.

* Fix variable name

* Update spacy/tokens/doc.pyx

* Test that setting invalid pos raises an error

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
											
										
										
											2021-09-16 14:28:05 +03:00
+								            if pos_name not in parts_of_speech.IDS:
 								                raise ValueError(Errors.E1021.format(pp=pos_name))
-												Fix #2014: token.pos_ not writeable

											
										
										
											2018-03-27 22:21:11 +03:00
+								            self.c.pos = parts_of_speech.IDS[pos_name]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property tag_:
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): Fine-grained part-of-speech tag."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.tag]
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												Fix Issue #600: Missing setters for Token attribute.

											
										
										
											2016-11-03 01:28:59 +03:00
+								        def __set__(self, tag):
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								            self.tag = self.vocab.strings.add(tag)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												cleanup

											
										
										
											2021-01-13 16:20:05 +03:00
+								    def has_dep(self):
 								        """Check whether the token has annotated dep information.
 								        Returns False when the dep label is unset/missing.
 								        RETURNS (bool): Whether the dep label is valid or not.
 								        """
 								        return not Token.missing_dep(self.c)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property dep_:
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): The syntactic dependency label."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.dep]
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								        def __set__(self, unicode label):
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								            self.c.dep = self.vocab.strings.add(label)
-												* Break up tokens.pyx into tokens/doc.pyx, tokens/token.pyx, tokens/spans.pyx

											
										
										
											2015-07-13 21:20:58 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def is_oov(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is out-of-vocabulary."""
-												Fix polarity of Token.is_oov and Lexeme.is_oov (#5634)

Fix `Token.is_oov` and `Lexeme.is_oov` so they return `True` when the
lexeme does **not** have a vector.
											
										
										
											2020-06-23 14:29:51 +03:00
+								        return self.c.lex.orth not in self.vocab.vectors
-												* Add is_oov property, and fix up handling of attributes

											
										
										
											2015-07-27 02:50:06 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def is_stop(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is a stop word, i.e. part of a
 								            "stop list" defined by the language data.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, IS_STOP)
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def is_alpha(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token consists of alpha characters.
 								            Equivalent to `token.text.isalpha()`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
-												* Whitespace

											
										
										
											2015-08-09 00:37:44 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def is_ascii(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token consists of ASCII characters.
 								            Equivalent to `all(ord(c) < 128 for c in token.text)`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def is_digit(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token consists of digits. Equivalent to
 								            `token.text.isdigit()`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def is_lower(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is in lowercase. Equivalent to
 								            `token.text.islower()`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def is_upper(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is in uppercase. Equivalent to
 								            `token.text.isupper()`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def is_title(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is in titlecase. Equivalent to
 								            `token.text.istitle()`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def is_punct(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is punctuation."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def is_space(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token consists of whitespace characters.
 								            Equivalent to `token.text.isspace()`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def is_bracket(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is a bracket."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
-												* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct

											
										
										
											2016-02-04 15:04:16 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def is_quote(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is a quotation mark."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
-												* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct

											
										
										
											2016-02-04 15:04:16 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def is_left_punct(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token is a left punctuation mark."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
-												* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct

											
										
										
											2016-02-04 15:04:16 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def is_right_punct(self):
-												Fix docstring for is_right_punct(). (#3044)


											
										
										
											2018-12-14 12:11:11 +03:00
+								        """RETURNS (bool): Whether the token is a right punctuation mark."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def is_currency(self):
-												added new lex feat to token

											
										
										
											2018-02-11 20:55:48 +03:00
+								        """RETURNS (bool): Whether the token is a currency symbol."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, IS_CURRENCY)
-												added new lex feat to token

											
										
										
											2018-02-11 20:55:48 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def like_url(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token resembles a URL."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
-												* Whitespace

											
										
										
											2015-08-09 00:37:44 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def like_num(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token resembles a number, e.g. "10.9",
 								            "10", "ten", etc.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def like_email(self):
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        """RETURNS (bool): Whether the token resembles an email address."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)