spaCy/spacy/tokens/doc.pyx


# coding: utf8
# cython: infer_types=True
# cython: bounds_check=False
# cython: profile=True
from __future__ import unicode_literals

cimport cython
cimport numpy as np
from libc.string cimport memcpy, memset
from libc.math cimport sqrt
from collections import Counter

import numpy
import numpy.linalg
import struct
import srsly
from thinc.neural.util import get_array_module, copy_array
import warnings

from .span cimport Span
from .token cimport Token
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t

from ..attrs import intify_attrs, IDS
from ..util import normalize_slice
from ..compat import is_config, copy_reg, pickle, basestring_
from ..errors import Errors, Warnings
from .. import util
from .underscore import Underscore, get_ext_args
from ._retokenize import Retokenizer


DEF PADDING = 5


cdef int bounds_check(int i, int length, int padding) except -1:
    if (i + padding) < 0:
        raise IndexError(Errors.E026.format(i=i, length=length))
    if (i - padding) >= length:
        raise IndexError(Errors.E026.format(i=i, length=length))


cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
    if feat_name == LEMMA:
        return token.lemma
    elif feat_name == NORM:
        if not token.norm:
            return token.lex.norm
        return token.norm
    elif feat_name == POS:
        return token.pos
    elif feat_name == TAG:
        return token.tag
    elif feat_name == DEP:
        return token.dep
    elif feat_name == HEAD:
        return token.head
    elif feat_name == SENT_START:
        return token.sent_start
    elif feat_name == SPACY:
        return token.spacy
    elif feat_name == ENT_IOB:
        return token.ent_iob
    elif feat_name == ENT_TYPE:
        return token.ent_type
    elif feat_name == ENT_ID:
        return token.ent_id
    elif feat_name == ENT_KB_ID:
        return token.ent_kb_id
    elif feat_name == IDX:
        return token.idx
    else:
        return Lexeme.get_struct_attr(token.lex, feat_name)


cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil:
    if feat_name == SENT_START:
        if token.sent_start == 1:
            return True
        else:
            return False
    else:
        return get_token_attr(token, feat_name)


def _get_chunker(lang):
    try:
        cls = util.get_lang_class(lang)
    except ImportError:
        return None
    except KeyError:
        return None
    return cls.Defaults.syntax_iterators.get("noun_chunks")


cdef class Doc:
    """A sequence of Token objects. Access sentences and named entities, export
    annotations to numpy arrays, losslessly serialize to compressed binary
    strings. The `Doc` object holds an array of `TokenC` structs. The
    Python-level `Token` and `Span` objects are views of this array, i.e.
    they don't own the data themselves.

    EXAMPLE:
        Construction 1
        >>> doc = nlp(u'Some text')

        Construction 2
        >>> from spacy.tokens import Doc
        >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
        >>>           spaces=[True, False, False])

    DOCS: https://spacy.io/api/doc
    """

    @classmethod
    def set_extension(cls, name, **kwargs):
        """Define a custom attribute which becomes available as `Doc._`.

        name (unicode): Name of the attribute to set.
        default: Optional default value of the attribute.
        getter (callable): Optional getter function.
        setter (callable): Optional setter function.
        method (callable): Optional method for method extension.
        force (bool): Force overwriting existing attribute.

        DOCS: https://spacy.io/api/doc#set_extension
        USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes
        """
        if cls.has_extension(name) and not kwargs.get("force", False):
            raise ValueError(Errors.E090.format(name=name, obj="Doc"))
        Underscore.doc_extensions[name] = get_ext_args(**kwargs)

    @classmethod
    def get_extension(cls, name):
        """Look up a previously registered extension by name.

        name (unicode): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple.

        DOCS: https://spacy.io/api/doc#get_extension
        """
        return Underscore.doc_extensions.get(name)

    @classmethod
    def has_extension(cls, name):
        """Check whether an extension has been registered.

        name (unicode): Name of the extension.
        RETURNS (bool): Whether the extension has been registered.

        DOCS: https://spacy.io/api/doc#has_extension
        """
        return name in Underscore.doc_extensions

    @classmethod
    def remove_extension(cls, name):
        """Remove a previously registered extension.

        name (unicode): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
            removed extension.

        DOCS: https://spacy.io/api/doc#remove_extension
        """
        if not cls.has_extension(name):
            raise ValueError(Errors.E046.format(name=name))
        return Underscore.doc_extensions.pop(name)

    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None,
                 orths_and_spaces=None):
        """Create a Doc object.

        vocab (Vocab): A vocabulary object, which must match any models you
            want to use (e.g. tokenizer, parser, entity recognizer).
        words (list or None): A list of unicode strings to add to the document
            as words. If `None`, defaults to empty list.
        spaces (list or None): A list of boolean values, of the same length as
            words. True means that the word is followed by a space, False means
            it is not. If `None`, defaults to `[True]*len(words)`
        user_data (dict or None): Optional extra data to attach to the Doc.
        RETURNS (Doc): The newly constructed object.

        DOCS: https://spacy.io/api/doc#init
        """
        self.vocab = vocab
        size = 20
        self.mem = Pool()
        # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
        # However, we need to remember the true starting places, so that we can
        # realloc.
        data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
        cdef int i
        for i in range(size + (PADDING*2)):
            data_start[i].lex = &EMPTY_LEXEME
            data_start[i].l_edge = i
            data_start[i].r_edge = i
        self.c = data_start + PADDING
        self.max_length = size
        self.length = 0
        self.is_tagged = False
        self.is_parsed = False
        self.sentiment = 0.0
        self.cats = {}
        self.user_hooks = {}
        self.user_token_hooks = {}
        self.user_span_hooks = {}
        self.tensor = numpy.zeros((0,), dtype="float32")
        self.user_data = {} if user_data is None else user_data
        self._vector = None
        self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
        cdef unicode orth
        cdef bint has_space
        if orths_and_spaces is None and words is not None:
            if spaces is None:
                spaces = [True] * len(words)
            elif len(spaces) != len(words):
                raise ValueError(Errors.E027)
            orths_and_spaces = zip(words, spaces)
        if orths_and_spaces is not None:
            for orth_space in orths_and_spaces:
                if isinstance(orth_space, unicode):
                    orth = orth_space
                    has_space = True
                elif isinstance(orth_space, bytes):
                    raise ValueError(Errors.E028.format(value=orth_space))
                else:
                    orth, has_space = orth_space
                # Note that we pass self.mem here --- we have ownership, if LexemeC
                # must be created.
                self.push_back(
                    <const LexemeC*>self.vocab.get(self.mem, orth), has_space)
        # Tough to decide on policy for this. Is an empty doc tagged and parsed?
        # There's no information we'd like to add to it, so I guess so?
        if self.length == 0:
            self.is_tagged = True
            self.is_parsed = True

    @property
    def _(self):
        """Custom extension attributes registered via `set_extension`."""
        return Underscore(Underscore.doc_extensions, self)

    @property
    def is_sentenced(self):
        """Check if the document has sentence boundaries assigned. This is
        defined as having at least one of the following:

        a) An entry "sents" in doc.user_hooks";
        b) Doc.is_parsed is set to True;
        c) At least one token other than the first where sent_start is not None.
        """
        if "sents" in self.user_hooks:
            return True
        if self.is_parsed:
            return True
        if len(self) < 2:
            return True
        for i in range(1, self.length):
            if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
                return True
        return False

    @property
    def is_nered(self):
        """Check if the document has named entities set. Will return True if
        *any* of the tokens has a named entity tag set (even if the others are
        unknown values), or if the document is empty.
        """
        if len(self) == 0:
            return True
        for i in range(self.length):
            if self.c[i].ent_iob != 0:
                return True
        return False

    def __getitem__(self, object i):
        """Get a `Token` or `Span` object.

        i (int or tuple) The index of the token, or the slice of the document
            to get.
        RETURNS (Token or Span): The token at `doc[i]]`, or the span at
            `doc[start : end]`.

        EXAMPLE:
            >>> doc[i]
            Get the `Token` object at position `i`, where `i` is an integer.
            Negative indexing is supported, and follows the usual Python
            semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.

            >>> doc[start : end]]
            Get a `Span` object, starting at position `start` and ending at
            position `end`, where `start` and `end` are token indices. For
            instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and
            4. Stepped slices (e.g. `doc[start : end : step]`) are not
            supported, as `Span` objects must be contiguous (cannot have gaps).
            You can use negative indices and open-ended ranges, which have
            their normal Python semantics.

        DOCS: https://spacy.io/api/doc#getitem
        """
        if isinstance(i, slice):
            start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
            return Span(self, start, stop, label=0)
        if i < 0:
            i = self.length + i
        bounds_check(i, self.length, PADDING)
        return Token.cinit(self.vocab, &self.c[i], i, self)

    def __iter__(self):
        """Iterate over `Token`  objects, from which the annotations can be
        easily accessed. This is the main way of accessing `Token` objects,
        which are the main way annotations are accessed from Python. If faster-
        than-Python speeds are required, you can instead access the annotations
        as a numpy array, or access the underlying C data directly from Cython.

        DOCS: https://spacy.io/api/doc#iter
        """
        cdef int i
        for i in range(self.length):
            yield Token.cinit(self.vocab, &self.c[i], i, self)

    def __len__(self):
        """The number of tokens in the document.

        RETURNS (int): The number of tokens in the document.

        DOCS: https://spacy.io/api/doc#len
        """
        return self.length

    def __unicode__(self):
        return "".join([t.text_with_ws for t in self])

    def __bytes__(self):
        return "".join([t.text_with_ws for t in self]).encode("utf-8")

    def __str__(self):
        if is_config(python3=True):
            return self.__unicode__()
        return self.__bytes__()

    def __repr__(self):
        return self.__str__()

    @property
    def doc(self):
        return self

    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict"):
        """Create a `Span` object from the slice
        `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
        created.

        doc (Doc): The parent document.
        start_idx (int): The index of the first character of the span.
        end_idx (int): The index of the first character after the span.
        label (uint64 or string): A label to attach to the Span, e.g. for
            named entities.
        kb_id (uint64 or string):  An ID from a KB to capture the meaning of a
            named entity.
        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
            the span.
        alignment_mode (str): How character indices are aligned to token
            boundaries. Options: "strict" (character indices must be aligned
            with token boundaries), "contract" (span of all tokens completely
            within the character span), "expand" (span of all tokens at least
            partially covered by the character span). Defaults to "strict".
        RETURNS (Span): The newly constructed object.

        DOCS: https://spacy.io/api/doc#char_span
        """
        if not isinstance(label, int):
            label = self.vocab.strings.add(label)
        if not isinstance(kb_id, int):
            kb_id = self.vocab.strings.add(kb_id)
        alignment_modes = ("strict", "contract", "expand")
        if alignment_mode not in alignment_modes:
            raise ValueError(Errors.E202.format(mode=alignment_mode, modes=", ".join(alignment_modes)))
        cdef int start = token_by_char(self.c, self.length, start_idx)
        if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
            return None
        # end_idx is exclusive, so find the token at one char before
        cdef int end = token_by_char(self.c, self.length, end_idx - 1)
        if end < 0 or (alignment_mode == "strict" and end_idx != self[end].idx + len(self[end])):
            return None
        # Adjust start and end by alignment_mode
        if alignment_mode == "contract":
            if self[start].idx < start_idx:
                start += 1
            if end_idx < self[end].idx + len(self[end]):
                end -= 1
            # if no tokens are completely within the span, return None
            if end < start:
                return None
        elif alignment_mode == "expand":
            # Don't consider the trailing whitespace to be part of the previous
            # token
            if start_idx == self[start].idx + len(self[start]):
                start += 1
        # Currently we have the token index, we want the range-end index
        end += 1
        cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
        return span

    def similarity(self, other):
        """Make a semantic similarity estimate. The default estimate is cosine
        similarity using an average of word vectors.

        other (object): The object to compare with. By default, accepts `Doc`,
            `Span`, `Token` and `Lexeme` objects.
        RETURNS (float): A scalar similarity score. Higher is more similar.

        DOCS: https://spacy.io/api/doc#similarity
        """
        if "similarity" in self.user_hooks:
            return self.user_hooks["similarity"](self, other)
        if isinstance(other, (Lexeme, Token)) and self.length == 1:
            if self.c[0].lex.orth == other.orth:
                return 1.0
        elif isinstance(other, (Span, Doc)) and len(self) == len(other):
            similar = True
            for i in range(self.length):
                if self[i].orth != other[i].orth:
                    similar = False
                    break
            if similar:
                return 1.0
        if self.vocab.vectors.n_keys == 0:
            warnings.warn(Warnings.W007.format(obj="Doc"))
        if self.vector_norm == 0 or other.vector_norm == 0:
            warnings.warn(Warnings.W008.format(obj="Doc"))
            return 0.0
        vector = self.vector
        xp = get_array_module(vector)
        return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)

    @property
    def has_vector(self):
        """A boolean value indicating whether a word vector is associated with
        the object.

        RETURNS (bool): Whether a word vector is associated with the object.

        DOCS: https://spacy.io/api/doc#has_vector
        """
        if "has_vector" in self.user_hooks:
            return self.user_hooks["has_vector"](self)
        elif self.vocab.vectors.data.size:
            return True
        elif self.tensor.size:
            return True
        else:
            return False

    property vector:
        """A real-valued meaning representation. Defaults to an average of the
        token vectors.

        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
            representing the document's semantics.

        DOCS: https://spacy.io/api/doc#vector
        """
        def __get__(self):
            if "vector" in self.user_hooks:
                return self.user_hooks["vector"](self)
            if self._vector is not None:
                return self._vector
            xp = get_array_module(self.vocab.vectors.data)
            if not len(self):
                self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
                return self._vector
            elif self.vocab.vectors.data.size > 0:
                self._vector = sum(t.vector for t in self) / len(self)
                return self._vector
            elif self.tensor.size > 0:
                self._vector = self.tensor.mean(axis=0)
                return self._vector
            else:
                return xp.zeros((self.vocab.vectors_length,), dtype="float32")

        def __set__(self, value):
            self._vector = value

    property vector_norm:
        """The L2 norm of the document's vector representation.

        RETURNS (float): The L2 norm of the vector representation.

        DOCS: https://spacy.io/api/doc#vector_norm
        """
        def __get__(self):
            if "vector_norm" in self.user_hooks:
                return self.user_hooks["vector_norm"](self)
            cdef float value
            cdef double norm = 0
            if self._vector_norm is None:
                norm = 0.0
                for value in self.vector:
                    norm += value * value
                self._vector_norm = sqrt(norm) if norm != 0 else 0
            return self._vector_norm

        def __set__(self, value):
            self._vector_norm = value

    @property
    def text(self):
        """A unicode representation of the document text.

        RETURNS (unicode): The original verbatim text of the document.
        """
        return "".join(t.text_with_ws for t in self)

    @property
    def text_with_ws(self):
        """An alias of `Doc.text`, provided for duck-type compatibility with
        `Span` and `Token`.

        RETURNS (unicode): The original verbatim text of the document.
        """
        return self.text

    property ents:
        """The named entities in the document. Returns a tuple of named entity
        `Span` objects, if the entity recognizer has been applied.

        RETURNS (tuple): Entities in the document, one `Span` per entity.

        DOCS: https://spacy.io/api/doc#ents
        """
        def __get__(self):
            cdef int i
            cdef const TokenC* token
            cdef int start = -1
            cdef attr_t label = 0
            cdef attr_t kb_id = 0
            output = []
            for i in range(self.length):
                token = &self.c[i]
                if token.ent_iob == 1:
                    if start == -1:
                        seq = ["%s|%s" % (t.text, t.ent_iob_) for t in self[i-5:i+5]]
                        raise ValueError(Errors.E093.format(seq=" ".join(seq)))
                elif token.ent_iob == 2 or token.ent_iob == 0:
                    if start != -1:
                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
                    start = -1
                    label = 0
                    kb_id = 0
                elif token.ent_iob == 3:
                    if start != -1:
                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
                    start = i
                    label = token.ent_type
                    kb_id = token.ent_kb_id
            if start != -1:
                output.append(Span(self, start, self.length, label=label, kb_id=kb_id))
            return tuple(output)

        def __set__(self, ents):
            # TODO:
            # 1. Test basic data-driven ORTH gazetteer
            # 2. Test more nuanced date and currency regex
            tokens_in_ents = {}
            cdef attr_t entity_type
            cdef attr_t kb_id
            cdef int ent_start, ent_end
            for ent_info in ents:
                entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info)
                for token_index in range(ent_start, ent_end):
                    if token_index in tokens_in_ents.keys():
                        raise ValueError(Errors.E103.format(
                            span1=(tokens_in_ents[token_index][0],
                                   tokens_in_ents[token_index][1],
                                   self.vocab.strings[tokens_in_ents[token_index][2]]),
                            span2=(ent_start, ent_end, self.vocab.strings[entity_type])))
                    tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
            cdef int i
            for i in range(self.length):
                # default values
                entity_type = 0
                kb_id = 0

                # Set ent_iob to Missing (0) bij default unless this token was nered before
                ent_iob = 0
                if self.c[i].ent_iob != 0:
                    ent_iob = 2

                # overwrite if the token was part of a specified entity
                if i in tokens_in_ents.keys():
                    ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
                    if entity_type is None or entity_type <= 0:
                        # Blocking this token from being overwritten by downstream NER
                        ent_iob = 3
                    elif ent_start == i:
                        # Marking the start of an entity
                        ent_iob = 3
                    else:
                        # Marking the inside of an entity
                        ent_iob = 1

                self.c[i].ent_type = entity_type
                self.c[i].ent_kb_id = kb_id
                self.c[i].ent_iob = ent_iob

    @property
    def noun_chunks(self):
        """Iterate over the base noun phrases in the document. Yields base
        noun-phrase #[code Span] objects, if the document has been
        syntactically parsed. A base noun phrase, or "NP chunk", is a noun
        phrase that does not permit other NPs to be nested within it – so no
        NP-level coordination, no prepositional phrases, and no relative
        clauses.

        YIELDS (Span): Noun chunks in the document.

        DOCS: https://spacy.io/api/doc#noun_chunks
        """
        
        # Accumulate the result before beginning to iterate over it. This
        # prevents the tokenisation from being changed out from under us
        # during the iteration. The tricky thing here is that Span accepts
        # its tokenisation changing, so it's okay once we have the Span
        # objects. See Issue #375.
        spans = []
        if self.noun_chunks_iterator is not None:
            for start, end, label in self.noun_chunks_iterator(self):
                spans.append(Span(self, start, end, label=label))
        for span in spans:
            yield span

    @property
    def sents(self):
        """Iterate over the sentences in the document. Yields sentence `Span`
        objects. Sentence spans have no label. To improve accuracy on informal
        texts, spaCy calculates sentence boundaries from the syntactic
        dependency parse. If the parser is disabled, the `sents` iterator will
        be unavailable.

        YIELDS (Span): Sentences in the document.

        DOCS: https://spacy.io/api/doc#sents
        """
        if not self.is_sentenced:
            raise ValueError(Errors.E030)
        if "sents" in self.user_hooks:
            yield from self.user_hooks["sents"](self)
        else:
            start = 0
            for i in range(1, self.length):
                if self.c[i].sent_start == 1:
                    yield Span(self, start, i)
                    start = i
            if start != self.length:
                yield Span(self, start, self.length)

    @property
    def lang(self):
        """RETURNS (uint64): ID of the language of the doc's vocabulary."""
        return self.vocab.strings[self.vocab.lang]

    @property
    def lang_(self):
        """RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'."""
        return self.vocab.lang

    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
        if self.length == 0:
            # Flip these to false when we see the first token.
            self.is_tagged = False
            self.is_parsed = False
        if self.length == self.max_length:
            self._realloc(self.length * 2)
        cdef TokenC* t = &self.c[self.length]
        if LexemeOrToken is const_TokenC_ptr:
            t[0] = lex_or_tok[0]
        else:
            t.lex = lex_or_tok
        if self.length == 0:
            t.idx = 0
        else:
            t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
        t.l_edge = self.length
        t.r_edge = self.length
        if t.lex.orth == 0:
            raise ValueError(Errors.E031.format(i=self.length))
        t.spacy = has_space
        self.length += 1
        if self.length == 1:
            # Set token.sent_start to 1 for first token. See issue #2869
            self.c[0].sent_start = 1
        return t.idx + t.lex.length + t.spacy

    @cython.boundscheck(False)
    cpdef np.ndarray to_array(self, object py_attr_ids):
        """Export given token attributes to a numpy `ndarray`.
        If `attr_ids` is a sequence of M attributes, the output array will be
        of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If
        `attr_ids` is a single attribute, the output shape will be (N,). You
        can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or
        string name (e.g. 'LEMMA' or 'lemma').

        attr_ids (list[]): A list of attributes (int IDs or string names).
        RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
            per word, and one column per attribute indicated in the input
            `attr_ids`.

        EXAMPLE:
            >>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
            >>> doc = nlp(text)
            >>> # All strings mapped to integers, for easy export to numpy
            >>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
        """
        cdef int i, j
        cdef attr_id_t feature
        cdef np.ndarray[attr_t, ndim=2] output
        # Handle scalar/list inputs of strings/ints for py_attr_ids
        # See also #3064
        if isinstance(py_attr_ids, basestring_):
            # Handle inputs like doc.to_array('ORTH')
            py_attr_ids = [py_attr_ids]
        elif not hasattr(py_attr_ids, "__iter__"):
            # Handle inputs like doc.to_array(ORTH)
            py_attr_ids = [py_attr_ids]
        # Allow strings, e.g. 'lemma' or 'LEMMA'
        py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
                       for id_ in py_attr_ids]
        # Make an array from the attributes --- otherwise our inner loop is
        # Python dict iteration.
        cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i")
        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
        c_output = <attr_t*>output.data
        c_attr_ids = <attr_id_t*>attr_ids.data
        cdef TokenC* token
        cdef int nr_attr = attr_ids.shape[0]
        for i in range(self.length):
            token = &self.c[i]
            for j in range(nr_attr):
                c_output[i*nr_attr + j] = get_token_attr(token, c_attr_ids[j])
        # Handle 1d case
        return output if len(attr_ids) >= 2 else output.reshape((self.length,))

    def count_by(self, attr_id_t attr_id, exclude=None, object counts=None):
        """Count the frequencies of a given attribute. Produces a dict of
        `{attribute (int): count (ints)}` frequencies, keyed by the values of
        the given attribute ID.

        attr_id (int): The attribute ID to key the counts.
        RETURNS (dict): A dictionary mapping attributes to integer counts.

        DOCS: https://spacy.io/api/doc#count_by
        """
        cdef int i
        cdef attr_t attr
        cdef size_t count

        if counts is None:
            counts = Counter()
            output_dict = True
        else:
            output_dict = False
        # Take this check out of the loop, for a bit of extra speed
        if exclude is None:
            for i in range(self.length):
                counts[get_token_attr(&self.c[i], attr_id)] += 1
        else:
            for i in range(self.length):
                if not exclude(self[i]):
                    counts[get_token_attr(&self.c[i], attr_id)] += 1
        if output_dict:
            return dict(counts)

    def _realloc(self, new_size):
        self.max_length = new_size
        n = new_size + (PADDING * 2)
        # What we're storing is a "padded" array. We've jumped forward PADDING
        # places, and are storing the pointer to that. This way, we can access
        # words out-of-bounds, and get out-of-bounds markers.
        # Now that we want to realloc, we need the address of the true start,
        # so we jump the pointer back PADDING places.
        cdef TokenC* data_start = self.c - PADDING
        data_start = <TokenC*>self.mem.realloc(data_start, n * sizeof(TokenC))
        self.c = data_start + PADDING
        cdef int i
        for i in range(self.length, self.max_length + PADDING):
            self.c[i].lex = &EMPTY_LEXEME

    cdef void set_parse(self, const TokenC* parsed) nogil:
        # TODO: This method is fairly misleading atm. It's used by Parser
        # to actually apply the parse calculated. Need to rethink this.
        # Probably we should use from_array?
        self.is_parsed = True
        for i in range(self.length):
            self.c[i] = parsed[i]

    def from_array(self, attrs, array):
        """Load attributes from a numpy array. Write to a `Doc` object, from an
        `(M, N)` array of attributes.

        attrs (list) A list of attribute ID ints.
        array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values.
        RETURNS (Doc): Itself.

        DOCS: https://spacy.io/api/doc#from_array
        """
        # Handle scalar/list inputs of strings/ints for py_attr_ids
        # See also #3064
        if isinstance(attrs, basestring_):
            # Handle inputs like doc.to_array('ORTH')
            attrs = [attrs]
        elif not hasattr(attrs, "__iter__"):
            # Handle inputs like doc.to_array(ORTH)
            attrs = [attrs]
        # Allow strings, e.g. 'lemma' or 'LEMMA'
        attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
                 for id_ in attrs]
        if array.dtype != numpy.uint64:
            warnings.warn(Warnings.W028.format(type=array.dtype))

        if SENT_START in attrs and HEAD in attrs:
            raise ValueError(Errors.E032)
        cdef int i, col, abs_head_index
        cdef attr_id_t attr_id
        cdef TokenC* tokens = self.c
        cdef int length = len(array)
        # Get set up for fast loading
        cdef Pool mem = Pool()
        cdef int n_attrs = len(attrs)
        # attrs should not be empty, but make sure to avoid zero-length mem alloc
        assert n_attrs > 0
        attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t))
        for i, attr_id in enumerate(attrs):
            attr_ids[i] = attr_id
        if len(array.shape) == 1:
            array = array.reshape((array.size, 1))
        # Check that all heads are within the document bounds
        if HEAD in attrs:
            col = attrs.index(HEAD)
            for i in range(length):
                # cast index to signed int
                abs_head_index = numpy.int32(array[i, col]) + i
                if abs_head_index < 0 or abs_head_index >= length:
                    raise ValueError(Errors.E190.format(index=i, value=array[i, col], rel_head_index=numpy.int32(array[i, col])))
        # Do TAG first. This lets subsequent loop override stuff like POS, LEMMA
        if TAG in attrs:
            col = attrs.index(TAG)
            for i in range(length):
                if array[i, col] != 0:
                    self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
        # Now load the data
        for i in range(length):
            token = &self.c[i]
            for j in range(n_attrs):
                if attr_ids[j] != TAG:
                    Token.set_struct_attr(token, attr_ids[j], array[i, j])
        # Set flags
        self.is_parsed = bool(self.is_parsed or HEAD in attrs)
        self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
        # If document is parsed, set children
        if self.is_parsed:
            set_children_from_heads(self.c, length)
        return self

    def get_lca_matrix(self):
        """Calculates a matrix of Lowest Common Ancestors (LCA) for a given
        `Doc`, where LCA[i, j] is the index of the lowest common ancestor among
        token i and j.

        RETURNS (np.array[ndim=2, dtype=numpy.int32]): LCA matrix with shape
            (n, n), where n = len(self).

        DOCS: https://spacy.io/api/doc#get_lca_matrix
        """
        return numpy.asarray(_get_lca_matrix(self, 0, len(self)))

    def to_disk(self, path, **kwargs):
        """Save the current state to a directory.

        path (unicode or Path): A path to a directory, which will be created if
            it doesn't exist. Paths may be either strings or Path-like objects.
        exclude (list): String names of serialization fields to exclude.

        DOCS: https://spacy.io/api/doc#to_disk
        """
        path = util.ensure_path(path)
        with path.open("wb") as file_:
            file_.write(self.to_bytes(**kwargs))

    def from_disk(self, path, **kwargs):
        """Loads state from a directory. Modifies the object in place and
        returns it.

        path (unicode or Path): A path to a directory. Paths may be either
            strings or `Path`-like objects.
        exclude (list): String names of serialization fields to exclude.
        RETURNS (Doc): The modified `Doc` object.

        DOCS: https://spacy.io/api/doc#from_disk
        """
        path = util.ensure_path(path)
        with path.open("rb") as file_:
            bytes_data = file_.read()
        return self.from_bytes(bytes_data, **kwargs)

    def to_bytes(self, exclude=tuple(), **kwargs):
        """Serialize, i.e. export the document contents to a binary string.

        exclude (list): String names of serialization fields to exclude.
        RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
            all annotations.

        DOCS: https://spacy.io/api/doc#to_bytes
        """
        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM, ENT_KB_ID]
        if self.is_tagged:
            array_head.extend([TAG, POS])
        # If doc parsed add head and dep attribute
        if self.is_parsed:
            array_head.extend([HEAD, DEP])
        # Otherwise add sent_start
        else:
            array_head.append(SENT_START)
        strings = set()
        for token in self:
            strings.add(token.tag_)
            strings.add(token.lemma_)
            strings.add(token.dep_)
            strings.add(token.ent_type_)
            strings.add(token.ent_kb_id_)
            strings.add(token.ent_id_)
            strings.add(token.norm_)
        # Msgpack doesn't distinguish between lists and tuples, which is
        # vexing for user data. As a best guess, we *know* that within
        # keys, we must have tuples. In values we just have to hope
        # users don't mind getting a list instead of a tuple.
        serializers = {
            "text": lambda: self.text,
            "array_head": lambda: array_head,
            "array_body": lambda: self.to_array(array_head),
            "sentiment": lambda: self.sentiment,
            "tensor": lambda: self.tensor,
            "cats": lambda: self.cats,
            "strings": lambda: list(strings),
        }
        for key in kwargs:
            if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
                raise ValueError(Errors.E128.format(arg=key))
        if "user_data" not in exclude and self.user_data:
            user_data_keys, user_data_values = list(zip(*self.user_data.items()))
            if "user_data_keys" not in exclude:
                serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys)
            if "user_data_values" not in exclude:
                serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
        return util.to_bytes(serializers, exclude)

    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
        """Deserialize, i.e. import the document contents from a binary string.

        data (bytes): The string to load from.
        exclude (list): String names of serialization fields to exclude.
        RETURNS (Doc): Itself.

        DOCS: https://spacy.io/api/doc#from_bytes
        """
        if self.length != 0:
            raise ValueError(Errors.E033.format(length=self.length))
        deserializers = {
            "text": lambda b: None,
            "array_head": lambda b: None,
            "array_body": lambda b: None,
            "sentiment": lambda b: None,
            "tensor": lambda b: None,
            "cats": lambda b: None,
            "strings": lambda b: None,
            "user_data_keys": lambda b: None,
            "user_data_values": lambda b: None,
        }
        for key in kwargs:
            if key in deserializers or key in ("user_data",):
                raise ValueError(Errors.E128.format(arg=key))
        msg = util.from_bytes(bytes_data, deserializers, exclude)
        # Msgpack doesn't distinguish between lists and tuples, which is
        # vexing for user data. As a best guess, we *know* that within
        # keys, we must have tuples. In values we just have to hope
        # users don't mind getting a list instead of a tuple.
        if "user_data" not in exclude and "user_data_keys" in msg:
            user_data_keys = srsly.msgpack_loads(msg["user_data_keys"], use_list=False)
            user_data_values = srsly.msgpack_loads(msg["user_data_values"])
            for key, value in zip(user_data_keys, user_data_values):
                self.user_data[key] = value
        cdef int i, start, end, has_space
        if "sentiment" not in exclude and "sentiment" in msg:
            self.sentiment = msg["sentiment"]
        if "tensor" not in exclude and "tensor" in msg:
            self.tensor = msg["tensor"]
        if "cats" not in exclude and "cats" in msg:
            self.cats = msg["cats"]
        if "strings" not in exclude and "strings" in msg:
            for s in msg["strings"]:
                self.vocab.strings.add(s)
        start = 0
        cdef const LexemeC* lex
        cdef unicode orth_
        text = msg["text"]
        attrs = msg["array_body"]
        for i in range(attrs.shape[0]):
            end = start + attrs[i, 0]
            has_space = attrs[i, 1]
            orth_ = text[start:end]
            lex = self.vocab.get(self.mem, orth_)
            self.push_back(lex, has_space)
            start = end + has_space
        self.from_array(msg["array_head"][2:], attrs[:, 2:])
        return self

    def extend_tensor(self, tensor):
        """Concatenate a new tensor onto the doc.tensor object.

        The doc.tensor attribute holds dense feature vectors
        computed by the models in the pipeline. Let's say a
        document with 30 words has a tensor with 128 dimensions
        per word. doc.tensor.shape will be (30, 128). After
        calling doc.extend_tensor with an array of shape (30, 64),
        doc.tensor == (30, 192).
        """
        xp = get_array_module(self.tensor)
        if self.tensor.size == 0:
            self.tensor.resize(tensor.shape, refcheck=False)
            copy_array(self.tensor, tensor)
        else:
            self.tensor = xp.hstack((self.tensor, tensor))

    def retokenize(self):
        """Context manager to handle retokenization of the Doc.
        Modifications to the Doc's tokenization are stored, and then
        made all at once when the context manager exits. This is
        much more efficient, and less error-prone.

        All views of the Doc (Span and Token) created before the
        retokenization are invalidated, although they may accidentally
        continue to work.

        DOCS: https://spacy.io/api/doc#retokenize
        USAGE: https://spacy.io/usage/linguistic-features#retokenization
        """
        return Retokenizer(self)

    def _bulk_merge(self, spans, attributes):
        """Retokenize the document, such that the spans given as arguments
         are merged into single tokens. The spans need to be in document
         order, and no span intersection is allowed.

        spans (Span[]): Spans to merge, in document order, with all span
            intersections empty. Cannot be empty.
        attributes (Dictionary[]): Attributes to assign to the merged tokens. By default,
            must be the same length as spans, empty dictionaries are allowed.
            attributes are inherited from the syntactic root of the span.
        RETURNS (Token): The first newly merged token.
        """
        cdef unicode tag, lemma, ent_type
        attr_len = len(attributes)
        span_len = len(spans)
        if not attr_len == span_len:
            raise ValueError(Errors.E121.format(attr_len=attr_len, span_len=span_len))
        with self.retokenize() as retokenizer:
            for i, span in enumerate(spans):
                fix_attributes(self, attributes[i])
                remove_label_if_necessary(attributes[i])
                retokenizer.merge(span, attributes[i])

    def merge(self, int start_idx, int end_idx, *args, **attributes):
        """Retokenize the document, such that the span at
        `doc.text[start_idx : end_idx]` is merged into a single token. If
        `start_idx` and `end_idx `do not mark start and end token boundaries,
        the document remains unchanged.

        start_idx (int): Character index of the start of the slice to merge.
        end_idx (int): Character index after the end of the slice to merge.
        **attributes: Attributes to assign to the merged token. By default,
            attributes are inherited from the syntactic root of the span.
        RETURNS (Token): The newly merged token, or `None` if the start and end
            indices did not fall at token boundaries.
        """
        cdef unicode tag, lemma, ent_type
        warnings.warn(Warnings.W013.format(obj="Doc"), DeprecationWarning)
        # TODO: ENT_KB_ID ?
        if len(args) == 3:
            warnings.warn(Warnings.W003, DeprecationWarning)
            tag, lemma, ent_type = args
            attributes[TAG] = tag
            attributes[LEMMA] = lemma
            attributes[ENT_TYPE] = ent_type
        elif not args:
            fix_attributes(self, attributes)
        elif args:
            raise ValueError(Errors.E034.format(n_args=len(args), args=repr(args),
                                                kwargs=repr(attributes)))
        remove_label_if_necessary(attributes)
        attributes = intify_attrs(attributes, strings_map=self.vocab.strings)
        cdef int start = token_by_start(self.c, self.length, start_idx)
        if start == -1:
            return None
        cdef int end = token_by_end(self.c, self.length, end_idx)
        if end == -1:
            return None
        # Currently we have the token index, we want the range-end index
        end += 1
        with self.retokenize() as retokenizer:
            retokenizer.merge(self[start:end], attrs=attributes)
        return self[start]

    def print_tree(self, light=False, flat=False):
        raise ValueError(Errors.E105)

    def to_json(self, underscore=None):
        """Convert a Doc to JSON. The format it produces will be the new format
        for the `spacy train` command (not implemented yet).

        underscore (list): Optional list of string names of custom doc._.
        attributes. Attribute values need to be JSON-serializable. Values will
        be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
        RETURNS (dict): The data in spaCy's JSON format.

        DOCS: https://spacy.io/api/doc#to_json
        """
        data = {"text": self.text}
        if self.is_nered:
            data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
                            "label": ent.label_} for ent in self.ents]
        if self.is_sentenced:
            sents = list(self.sents)
            data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
                             for sent in sents]
        if self.cats:
            data["cats"] = self.cats
        data["tokens"] = []
        for token in self:
            token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
            if self.is_tagged:
                token_data["pos"] = token.pos_
                token_data["tag"] = token.tag_
            if self.is_parsed:
                token_data["dep"] = token.dep_
                token_data["head"] = token.head.i
            data["tokens"].append(token_data)
        if underscore:
            data["_"] = {}
            for attr in underscore:
                if not self.has_extension(attr):
                    raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
                value = self._.get(attr)
                if not srsly.is_json_serializable(value):
                    raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
                data["_"][attr] = value
        return data

    def to_utf8_array(self, int nr_char=-1):
        """Encode word strings to utf8, and export to a fixed-width array
        of characters. Characters are placed into the array in the order:
            0, -1, 1, -2, etc
        For example, if the array is sliced array[:, :8], the array will
        contain the first 4 characters and last 4 characters of each word ---
        with the middle characters clipped out. The value 255 is used as a pad
        value.
        """
        byte_strings = [token.orth_.encode('utf8') for token in self]
        if nr_char == -1:
            nr_char = max(len(bs) for bs in byte_strings)
        cdef np.ndarray output = numpy.zeros((len(byte_strings), nr_char), dtype='uint8')
        output.fill(255)
        cdef int i, j, start_idx, end_idx
        cdef bytes byte_string
        cdef unsigned char utf8_char
        for i, byte_string in enumerate(byte_strings):
            j = 0
            start_idx = 0
            end_idx = len(byte_string) - 1
            while j < nr_char and start_idx <= end_idx:
                output[i, j] = <unsigned char>byte_string[start_idx]
                start_idx += 1
                j += 1
                if j < nr_char and start_idx <= end_idx:
                    output[i, j] = <unsigned char>byte_string[end_idx]
                    end_idx -= 1
                    j += 1
        return output


cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
    cdef int i = token_by_char(tokens, length, start_char)
    if i >= 0 and tokens[i].idx == start_char:
        return i
    else:
        return -1


cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2:
    # end_char is exclusive, so find the token at one char before
    cdef int i = token_by_char(tokens, length, end_char - 1)
    if i >= 0 and tokens[i].idx + tokens[i].lex.length == end_char:
        return i
    else:
        return -1


cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2:
    cdef int start = 0, mid, end = length - 1
    while start <= end:
        mid = (start + end) / 2
        if char_idx < tokens[mid].idx:
            end = mid - 1
        elif char_idx >= tokens[mid].idx + tokens[mid].lex.length + tokens[mid].spacy:
            start = mid + 1
        else:
            return mid
    return -1


cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
    cdef TokenC* head
    cdef TokenC* child
    cdef int i
    # Set number of left/right children to 0. We'll increment it in the loops.
    for i in range(length):
        tokens[i].l_kids = 0
        tokens[i].r_kids = 0
        tokens[i].l_edge = i
        tokens[i].r_edge = i
    cdef int loop_count = 0
    cdef bint heads_within_sents = False
    # Try up to 10 iterations of adjusting lr_kids and lr_edges in order to
    # handle non-projective dependency parses, stopping when all heads are
    # within their respective sentence boundaries. We have documented cases
    # that need at least 4 iterations, so this is to be on the safe side
    # without risking getting stuck in an infinite loop if something is
    # terribly malformed.
    while not heads_within_sents:
        heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count)
        if loop_count > 10:
            warnings.warn(Warnings.W026)
            break
        loop_count += 1
    # Set sentence starts
    for i in range(length):
        if tokens[i].head == 0 and tokens[i].dep != 0:
            tokens[tokens[i].l_edge].sent_start = True


cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1:
    # May be called multiple times due to non-projectivity. See issues #3170
    # and #4688.
    # Set left edges
    cdef TokenC* head
    cdef TokenC* child
    cdef int i, j
    for i in range(length):
        child = &tokens[i]
        head = &tokens[i + child.head]
        if child < head and loop_count == 0:
            head.l_kids += 1
        if child.l_edge < head.l_edge:
            head.l_edge = child.l_edge
        if child.r_edge > head.r_edge:
            head.r_edge = child.r_edge
    # Set right edges - same as above, but iterate in reverse
    for i in range(length-1, -1, -1):
        child = &tokens[i]
        head = &tokens[i + child.head]
        if child > head and loop_count == 0:
            head.r_kids += 1
        if child.r_edge > head.r_edge:
            head.r_edge = child.r_edge
        if child.l_edge < head.l_edge:
            head.l_edge = child.l_edge
    # Get sentence start positions according to current state
    sent_starts = set()
    for i in range(length):
        if tokens[i].head == 0 and tokens[i].dep != 0:
            sent_starts.add(tokens[i].l_edge)
    cdef int curr_sent_start = 0
    cdef int curr_sent_end = 0
    # Check whether any heads are not within the current sentence
    for i in range(length):
        if (i > 0 and i in sent_starts) or i == length - 1:
            curr_sent_end = i
            for j in range(curr_sent_start, curr_sent_end):
                if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1:
                    return False
            curr_sent_start = i
    return True


cdef int _get_tokens_lca(Token token_j, Token token_k):
    """Given two tokens, returns the index of the lowest common ancestor
    (LCA) among the two. If they have no common ancestor, -1 is returned.

    token_j (Token): a token.
    token_k (Token): another token.
    RETURNS (int): index of lowest common ancestor, or -1 if the tokens
        have no common ancestor.
    """
    if token_j == token_k:
        return token_j.i
    elif token_j.head == token_k:
        return token_k.i
    elif token_k.head == token_j:
        return token_j.i
    token_j_ancestors = set(token_j.ancestors)
    if token_k in token_j_ancestors:
        return token_k.i
    for token_k_ancestor in token_k.ancestors:
        if token_k_ancestor == token_j:
            return token_j.i
        if token_k_ancestor in token_j_ancestors:
            return token_k_ancestor.i
    return -1


cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
    """Given a doc and a start and end position defining a set of contiguous
    tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where
    LCA[i, j] is the index of the lowest common ancestor among token i and j.
    If the tokens have no common ancestor within the specified span,
    LCA[i, j] will be -1.

    doc (Doc): The index of the token, or the slice of the document
    start (int): First token to be included in the LCA matrix.
    end (int): Position of next to last token included in the LCA matrix.
    RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32],
        with shape (n, n), where n = len(doc).
    """
    cdef int [:,:] lca_matrix
    n_tokens= end - start
    lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
    lca_mat.fill(-1)
    lca_matrix = lca_mat
    for j in range(n_tokens):
        token_j = doc[start + j]
        # the common ancestor of token and itself is itself:
        lca_matrix[j, j] = j
        # we will only iterate through tokens in the same sentence
        sent = token_j.sent
        sent_start = sent.start
        j_idx_in_sent = start + j - sent_start
        n_missing_tokens_in_sent = len(sent) - j_idx_in_sent
        # make sure we do not go past `end`, in cases where `end` < sent.end
        max_range = min(j + n_missing_tokens_in_sent, end - start)
        for k in range(j + 1, max_range):
            lca = _get_tokens_lca(token_j, doc[start + k])
            # if lca is outside of span, we set it to -1
            if not start <= lca < end:
                lca_matrix[j, k] = -1
                lca_matrix[k, j] = -1
            else:
                lca_matrix[j, k] = lca - start
                lca_matrix[k, j] = lca - start
    return lca_matrix


def pickle_doc(doc):
    bytes_data = doc.to_bytes(exclude=["vocab", "user_data"])
    hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,
                      doc.user_token_hooks)
    return (unpickle_doc, (doc.vocab, srsly.pickle_dumps(hooks_and_data), bytes_data))


def unpickle_doc(vocab, hooks_and_data, bytes_data):
    user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data)

    doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude=["user_data"])
    doc.user_hooks.update(doc_hooks)
    doc.user_span_hooks.update(span_hooks)
    doc.user_token_hooks.update(token_hooks)
    return doc


copy_reg.pickle(Doc, pickle_doc, unpickle_doc)


def remove_label_if_necessary(attributes):
    # More deprecated attribute handling =/
    if "label" in attributes:
        attributes["ent_type"] = attributes.pop("label")


def fix_attributes(doc, attributes):
    if "label" in attributes and "ent_type" not in attributes:
        if isinstance(attributes["label"], int):
            attributes[ENT_TYPE] = attributes["label"]
        else:
            attributes[ENT_TYPE] = doc.vocab.strings[attributes["label"]]
    if "ent_type" in attributes:
        attributes[ENT_TYPE] = attributes["ent_type"]


def get_entity_info(ent_info):
    if isinstance(ent_info, Span):
        ent_type = ent_info.label
        ent_kb_id = ent_info.kb_id
        start = ent_info.start
        end = ent_info.end
    elif len(ent_info) == 3:
        ent_type, start, end = ent_info
        ent_kb_id = 0
    elif len(ent_info) == 4:
        ent_type, ent_kb_id, start, end = ent_info
    else:
        ent_id, ent_kb_id, ent_type, start, end = ent_info
    return ent_type, ent_kb_id, start, end
-												Fix issue 2396 (#3089)

* Test on #2396: bug in Doc.get_lca_matrix()

* reimplementation of Doc.get_lca_matrix(), (closes #2396)

* reimplement Span.get_lca_matrix(), and call it from Doc.get_lca_matrix()

* tests Span.get_lca_matrix() as well as Doc.get_lca_matrix()

* implement _get_lca_matrix as a helper function in doc.pyx; call it from Doc.get_lca_matrix and Span.get_lca_matrix

* use memory view instead of np.ndarray in _get_lca_matrix (faster)

* fix bug when calling Span.get_lca_matrix; return lca matrix as np.array instead of memoryview

* cleaner conditional, add comment

											
										
										
											2018-12-29 20:02:26 +03:00
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								# coding: utf8
-												Fix serializer

											
										
										
											2017-05-09 19:45:18 +03:00
+								# cython: infer_types=True
 								# cython: bounds_check=False
-												Improve efficiency of Doc.to_array

											
										
										
											2017-11-17 20:55:56 +03:00
+								# cython: profile=True
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								from __future__ import unicode_literals
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								cimport cython
 								cimport numpy as np
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								from libc.string cimport memcpy, memset
 								from libc.math cimport sqrt
-												counter instead of preshcounter

											
										
										
											2019-07-11 14:05:53 +03:00
+								from collections import Counter
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								import numpy
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								import numpy.linalg
-												* Serialization round trip now working with decent API, but with rough spots in the organisation and requiring vocabulary to be fixed ahead of time.

											
										
										
											2015-07-19 16:18:17 +03:00
+								import struct
-												Fix removabl of dill (for srsly)

											
										
										
											2018-12-06 20:46:09 +03:00
+								import srsly
-												Add Doc.extend_tensor() method

											
										
										
											2017-11-03 13:20:31 +03:00
+								from thinc.neural.util import get_array_module, copy_array
-												Simplify warnings

											
										
										
											2020-04-28 14:37:37 +03:00
+								import warnings
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
-												Tidy up imports

											
										
										
											2017-05-13 14:04:40 +03:00
+								from .span cimport Span
 								from .token cimport Token
 								from ..lexeme cimport Lexeme, EMPTY_LEXEME
-												* Move serialization functionality out into a Serializer object

											
										
										
											2015-07-16 12:21:44 +03:00
+								from ..typedefs cimport attr_t, flags_t
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
 								from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
-												make idx available via to_array (#5030)


											
										
										
											2020-02-22 16:13:06 +03:00
+								from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
-												Tidy up imports

											
										
										
											2017-05-13 14:04:40 +03:00
+								from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								from ..attrs import intify_attrs, IDS
-												💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)

Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉

See here: https://github.com/explosion/srsly

    Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.

    At the same time, we noticed that having a lot of small dependencies was making maintainence harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.

    srsly currently includes forks of the following packages:

        ujson
        msgpack
        msgpack-numpy
        cloudpickle



* WIP: replace json/ujson with srsly

* Replace ujson in examples

Use regular json instead of srsly to make code easier to read and follow

* Update requirements

* Fix imports

* Fix typos

* Replace msgpack with srsly

* Fix warning

											
										
										
											2018-12-03 03:28:22 +03:00
+								from ..util import normalize_slice
-												Fix Doc.to_array when only one string attr provided

											
										
										
											2017-11-01 15:25:44 +03:00
+								from ..compat import is_config, copy_reg, pickle, basestring_
-												💫 Add .similarity warnings for no vectors and option to exclude warnings (#2197)

* Add logic to filter out warning IDs via environment variable

Usage: SPACY_WARNING_EXCLUDE=W001,W007

* Add warnings for empty vectors

* Add warning if no word vectors are used in .similarity methods

For example, if only tensors are available in small models – should hopefully clear up some confusion around this

* Capture warnings in tests

* Rename SPACY_WARNING_EXCLUDE to SPACY_WARNING_IGNORE

											
										
										
											2018-05-21 02:22:38 +03:00
+								from ..errors import Errors, Warnings
-												Try using tensor for vector/similarity methdos

											
										
										
											2017-05-31 00:35:17 +03:00
+								from .. import util
-												Don't raise error if set_extension has getter and setter (closes #2177)

Improve error messages, raise error if setter is specified without a getter and compare against _unset to allow default=None. Also add more tests.

											
										
										
											2018-04-03 19:30:17 +03:00
+								from .underscore import Underscore, get_ext_args
-												Add doc.retokenize() context manager (#2172)

This patch takes a step towards #1487 by introducing the
doc.retokenize() context manager, to handle merging spans, and soon
splitting tokens.

The idea is to do merging and splitting like this:

with doc.retokenize() as retokenizer:
    for start, end, label in matches:
        retokenizer.merge(doc[start : end], attrs={'ent_type': label})

The retokenizer accumulates the merge requests, and applies them
together at the end of the block. This will allow retokenization to be
more efficient, and much less error prone.

A retokenizer.split() function will then be added, to handle splitting a
single token into multiple tokens. These methods take `Span` and `Token`
objects; if the user wants to go directly from offsets, they can append
to the .merges and .splits lists on the retokenizer.

The doc.merge() method's behaviour remains unchanged, so this patch
should be 100% backwards incompatible (modulo bugs). Internally,
doc.merge() fixes up the arguments (to handle the various deprecated styles),
opens the retokenizer, and makes the single merge.

We can later start making deprecation warnings on direct calls to doc.merge(),
to migrate people to use of the retokenize context manager.
											
										
										
											2018-04-03 15:10:35 +03:00
+								from ._retokenize import Retokenizer
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								DEF PADDING = 5
 								cdef int bounds_check(int i, int length, int padding) except -1:
 								    if (i + padding) < 0:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								        raise IndexError(Errors.E026.format(i=i, length=length))
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								    if (i - padding) >= length:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								        raise IndexError(Errors.E026.format(i=i, length=length))
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
 								cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
 								    if feat_name == LEMMA:
 								        return token.lemma
-												fetch norm from lex if necessary for matching (#4080)


											
										
										
											2019-08-06 00:51:04 +03:00
+								    elif feat_name == NORM:
 								        if not token.norm:
 								            return token.lex.norm
 								        return token.norm
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								    elif feat_name == POS:
 								        return token.pos
 								    elif feat_name == TAG:
 								        return token.tag
 								    elif feat_name == DEP:
 								        return token.dep
-												* Fix import of attrs in doc.pyx, and update the get_token_attr function.

											
										
										
											2015-07-16 02:15:34 +03:00
+								    elif feat_name == HEAD:
 								        return token.head
-												Add SENT_START attribute, for custom sentence boundary detection

											
										
										
											2016-05-05 13:11:57 +03:00
+								    elif feat_name == SENT_START:
 								        return token.sent_start
-												* Fix import of attrs in doc.pyx, and update the get_token_attr function.

											
										
										
											2015-07-16 02:15:34 +03:00
+								    elif feat_name == SPACY:
 								        return token.spacy
 								    elif feat_name == ENT_IOB:
 								        return token.ent_iob
 								    elif feat_name == ENT_TYPE:
 								        return token.ent_type
-												serialize ENT_ID (#4852)

* expand serialization test for custom token attribute

* add failing test for issue 4849

* define ENT_ID as attr and use in doc serialization

* fix few typos

											
										
										
											2020-01-06 16:57:34 +03:00
+								    elif feat_name == ENT_ID:
 								        return token.ent_id
-												ensure Span.as_doc keeps the entity links + unit test

											
										
										
											2019-06-25 16:28:51 +03:00
+								    elif feat_name == ENT_KB_ID:
 								        return token.ent_kb_id
-												make idx available via to_array (#5030)


											
										
										
											2020-02-22 16:13:06 +03:00
+								    elif feat_name == IDX:
 								        return token.idx
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								    else:
-												* Begin merge of Gazetteer and DE branches

											
										
										
											2015-09-06 20:45:15 +03:00
+								        return Lexeme.get_struct_attr(token.lex, feat_name)
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
-												Normalize TokenC.sent_start values for Matcher (#5346)

Normalize TokenC.sent_start values to booleans for the `Matcher`.
											
										
										
											2020-04-29 13:57:30 +03:00
+								cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil:
 								    if feat_name == SENT_START:
 								        if token.sent_start == 1:
 								            return True
 								        else:
 								            return False
 								    else:
 								        return get_token_attr(token, feat_name)
-												Improve way noun chunks iterator is looked up

											
										
										
											2017-06-04 22:53:39 +03:00
+								def _get_chunker(lang):
-												Fix fetching of noun chunk iterator

											
										
										
											2017-06-04 23:53:05 +03:00
+								    try:
 								        cls = util.get_lang_class(lang)
 								    except ImportError:
 								        return None
 								    except KeyError:
 								        return None
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								    return cls.Defaults.syntax_iterators.get("noun_chunks")
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								cdef class Doc:
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								    """A sequence of Token objects. Access sentences and named entities, export
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								    annotations to numpy arrays, losslessly serialize to compressed binary
 								    strings. The `Doc` object holds an array of `TokenC` structs. The
 								    Python-level `Token` and `Span` objects are views of this array, i.e.
 								    they don't own the data themselves.
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
-												Update tokenizer and doc init example  (#3939)

* Fix Doc.to_json hyperlink

* Update tokenizer and doc init examples

* Change "matchin rules" to "punctuation rules"

* Auto-format

											
										
										
											2019-07-10 11:16:48 +03:00
+								    EXAMPLE:
 								        Construction 1
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        >>> doc = nlp(u'Some text')
 								        Construction 2
 								        >>> from spacy.tokens import Doc
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
-												Update tokenizer and doc init example  (#3939)

* Fix Doc.to_json hyperlink

* Update tokenizer and doc init examples

* Change "matchin rules" to "punctuation rules"

* Auto-format

											
										
										
											2019-07-10 11:16:48 +03:00
+								        >>>           spaces=[True, False, False])
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								    DOCS: https://spacy.io/api/doc
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								    """
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
+								    @classmethod
-												Don't raise error if set_extension has getter and setter (closes #2177)

Improve error messages, raise error if setter is specified without a getter and compare against _unset to allow default=None. Also add more tests.

											
										
										
											2018-04-03 19:30:17 +03:00
+								    def set_extension(cls, name, **kwargs):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """Define a custom attribute which becomes available as `Doc._`.
 								        name (unicode): Name of the attribute to set.
 								        default: Optional default value of the attribute.
 								        getter (callable): Optional getter function.
 								        setter (callable): Optional setter function.
 								        method (callable): Optional method for method extension.
 								        force (bool): Force overwriting existing attribute.
 								        DOCS: https://spacy.io/api/doc#set_extension
 								        USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes
 								        """
 								        if cls.has_extension(name) and not kwargs.get("force", False):
 								            raise ValueError(Errors.E090.format(name=name, obj="Doc"))
-												Don't raise error if set_extension has getter and setter (closes #2177)

Improve error messages, raise error if setter is specified without a getter and compare against _unset to allow default=None. Also add more tests.

											
										
										
											2018-04-03 19:30:17 +03:00
+								        Underscore.doc_extensions[name] = get_ext_args(**kwargs)
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
 								    @classmethod
 								    def get_extension(cls, name):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """Look up a previously registered extension by name.
 								        name (unicode): Name of the extension.
 								        RETURNS (tuple): A `(default, method, getter, setter)` tuple.
 								        DOCS: https://spacy.io/api/doc#get_extension
 								        """
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
+								        return Underscore.doc_extensions.get(name)
 								    @classmethod
 								    def has_extension(cls, name):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """Check whether an extension has been registered.
 								        name (unicode): Name of the extension.
 								        RETURNS (bool): Whether the extension has been registered.
 								        DOCS: https://spacy.io/api/doc#has_extension
 								        """
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
+								        return name in Underscore.doc_extensions
-												Add remove_extension method on Doc, Token and Span (closes #2242)

											
										
										
											2018-04-29 00:33:09 +03:00
+								    @classmethod
 								    def remove_extension(cls, name):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """Remove a previously registered extension.
 								        name (unicode): Name of the extension.
 								        RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
 								            removed extension.
 								        DOCS: https://spacy.io/api/doc#remove_extension
 								        """
-												Add remove_extension method on Doc, Token and Span (closes #2242)

											
										
										
											2018-04-29 00:33:09 +03:00
+								        if not cls.has_extension(name):
 								            raise ValueError(Errors.E046.format(name=name))
 								        return Underscore.doc_extensions.pop(name)
-												Fix Doc pickling. This also removes need for Binder class

											
										
										
											2017-10-17 17:11:13 +03:00
+								    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None,
 								                 orths_and_spaces=None):
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        """Create a Doc object.
-												Improve docstrings for Doc object

											
										
										
											2016-09-28 12:15:13 +03:00
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        vocab (Vocab): A vocabulary object, which must match any models you
 								            want to use (e.g. tokenizer, parser, entity recognizer).
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        words (list or None): A list of unicode strings to add to the document
 								            as words. If `None`, defaults to empty list.
 								        spaces (list or None): A list of boolean values, of the same length as
 								            words. True means that the word is followed by a space, False means
 								            it is not. If `None`, defaults to `[True]*len(words)`
-												Fix Doc pickling. This also removes need for Binder class

											
										
										
											2017-10-17 17:11:13 +03:00
+								        user_data (dict or None): Optional extra data to attach to the Doc.
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        RETURNS (Doc): The newly constructed object.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								        DOCS: https://spacy.io/api/doc#init
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        self.vocab = vocab
 								        size = 20
 								        self.mem = Pool()
 								        # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
 								        # However, we need to remember the true starting places, so that we can
 								        # realloc.
 								        data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
 								        cdef int i
 								        for i in range(size + (PADDING*2)):
 								            data_start[i].lex = &EMPTY_LEXEME
-												* Fix L/R edge bug, by ensuring l_edge and r_edge are preset, and fixing the way the edge update in del_arc. Bugs keep arising here because the edges are absolute positions, where everything else is relative. I'm also not 100% convinced that del_arc is handled correctly. Do we need to update the parents?

											
										
										
											2015-09-09 04:39:46 +03:00
+								            data_start[i].l_edge = i
 								            data_start[i].r_edge = i
-												* Rename Doc.data to Doc.c

											
										
										
											2015-11-03 16:15:14 +03:00
+								        self.c = data_start + PADDING
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        self.max_length = size
 								        self.length = 0
 								        self.is_tagged = False
 								        self.is_parsed = False
-												Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.

											
										
										
											2016-10-19 21:54:03 +03:00
+								        self.sentiment = 0.0
-												Add slot for text categories to Doc

											
										
										
											2017-07-22 01:34:15 +03:00
+								        self.cats = {}
-												Fix hook names in doc

											
										
										
											2016-10-19 22:15:16 +03:00
+								        self.user_hooks = {}
 								        self.user_token_hooks = {}
 								        self.user_span_hooks = {}
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        self.tensor = numpy.zeros((0,), dtype="float32")
-												Fix Doc pickling. This also removes need for Binder class

											
										
										
											2017-10-17 17:11:13 +03:00
+								        self.user_data = {} if user_data is None else user_data
-												* Try giving Doc and Span objects vector and vector_norm attributes, and .similarity functions. Turns out to be bad idea.

											
										
										
											2015-09-17 04:50:11 +03:00
+								        self._vector = None
-												Improve way noun chunks iterator is looked up

											
										
										
											2017-06-04 22:53:39 +03:00
+								        self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
-												Fix orths_and_spaces in Doc.__init__

											
										
										
											2016-09-21 15:52:05 +03:00
+								        cdef unicode orth
 								        cdef bint has_space
-												Add words and spaces keyword arguments to Doc.

											
										
										
											2016-10-16 19:13:03 +03:00
+								        if orths_and_spaces is None and words is not None:
 								            if spaces is None:
 								                spaces = [True] * len(words)
-												Add input error handling in Doc

											
										
										
											2016-10-16 19:16:42 +03:00
+								            elif len(spaces) != len(words):
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                raise ValueError(Errors.E027)
-												Add words and spaces keyword arguments to Doc.

											
										
										
											2016-10-16 19:13:03 +03:00
+								            orths_and_spaces = zip(words, spaces)
-												Fix orths_and_spaces in Doc.__init__

											
										
										
											2016-09-21 15:52:05 +03:00
+								        if orths_and_spaces is not None:
 								            for orth_space in orths_and_spaces:
 								                if isinstance(orth_space, unicode):
 								                    orth = orth_space
 								                    has_space = True
 								                elif isinstance(orth_space, bytes):
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                    raise ValueError(Errors.E028.format(value=orth_space))
-												Fix orths_and_spaces in Doc.__init__

											
										
										
											2016-09-21 15:52:05 +03:00
+								                else:
 								                    orth, has_space = orth_space
 								                # Note that we pass self.mem here --- we have ownership, if LexemeC
 								                # must be created.
 								                self.push_back(
 								                    <const LexemeC*>self.vocab.get(self.mem, orth), has_space)
-												Fix Issue #599, by considering empty documents to be parsed and tagged. Implementation is a bit dodgy.

											
										
										
											2016-11-03 01:47:46 +03:00
+								        # Tough to decide on policy for this. Is an empty doc tagged and parsed?
 								        # There's no information we'd like to add to it, so I guess so?
 								        if self.length == 0:
 								            self.is_tagged = True
 								            self.is_parsed = True
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
+								    @property
 								    def _(self):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """Custom extension attributes registered via `set_extension`."""
-												Pass extensions into Underscore class

											
										
										
											2017-10-07 19:56:01 +03:00
+								        return Underscore(Underscore.doc_extensions, self)
-												Add doc.retokenize() context manager (#2172)

This patch takes a step towards #1487 by introducing the
doc.retokenize() context manager, to handle merging spans, and soon
splitting tokens.

The idea is to do merging and splitting like this:

with doc.retokenize() as retokenizer:
    for start, end, label in matches:
        retokenizer.merge(doc[start : end], attrs={'ent_type': label})

The retokenizer accumulates the merge requests, and applies them
together at the end of the block. This will allow retokenization to be
more efficient, and much less error prone.

A retokenizer.split() function will then be added, to handle splitting a
single token into multiple tokens. These methods take `Span` and `Token`
objects; if the user wants to go directly from offsets, they can append
to the .merges and .splits lists on the retokenizer.

The doc.merge() method's behaviour remains unchanged, so this patch
should be 100% backwards incompatible (modulo bugs). Internally,
doc.merge() fixes up the arguments (to handle the various deprecated styles),
opens the retokenizer, and makes the single merge.

We can later start making deprecation warnings on direct calls to doc.merge(),
to migrate people to use of the retokenize context manager.
											
										
										
											2018-04-03 15:10:35 +03:00
+								    @property
 								    def is_sentenced(self):
-												Make doc[0].is_sent_start == True (closes #2869) (#3340)

* Make doc[0] have sent_start True. Closes #2869

* Document that doc[0].is_sent_start defaults True.

											
										
										
											2019-02-27 13:17:17 +03:00
+								        """Check if the document has sentence boundaries assigned. This is
 								        defined as having at least one of the following:
 								        a) An entry "sents" in doc.user_hooks";
-												💫 Add better and serializable sentencizer (#3471)

* Add better serializable sentencizer component

* Replace default factory

* Add tests

* Tidy up

* Pass test

* Update docs

											
										
										
											2019-03-23 17:45:02 +03:00
+								        b) Doc.is_parsed is set to True;
-												Make doc[0].is_sent_start == True (closes #2869) (#3340)

* Make doc[0] have sent_start True. Closes #2869

* Document that doc[0].is_sent_start defaults True.

											
										
										
											2019-02-27 13:17:17 +03:00
+								        c) At least one token other than the first where sent_start is not None.
 								        """
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        if "sents" in self.user_hooks:
-												Add doc.retokenize() context manager (#2172)

This patch takes a step towards #1487 by introducing the
doc.retokenize() context manager, to handle merging spans, and soon
splitting tokens.

The idea is to do merging and splitting like this:

with doc.retokenize() as retokenizer:
    for start, end, label in matches:
        retokenizer.merge(doc[start : end], attrs={'ent_type': label})

The retokenizer accumulates the merge requests, and applies them
together at the end of the block. This will allow retokenization to be
more efficient, and much less error prone.

A retokenizer.split() function will then be added, to handle splitting a
single token into multiple tokens. These methods take `Span` and `Token`
objects; if the user wants to go directly from offsets, they can append
to the .merges and .splits lists on the retokenizer.

The doc.merge() method's behaviour remains unchanged, so this patch
should be 100% backwards incompatible (modulo bugs). Internally,
doc.merge() fixes up the arguments (to handle the various deprecated styles),
opens the retokenizer, and makes the single merge.

We can later start making deprecation warnings on direct calls to doc.merge(),
to migrate people to use of the retokenize context manager.
											
										
										
											2018-04-03 15:10:35 +03:00
+								            return True
 								        if self.is_parsed:
 								            return True
-												Return True from doc.is_... when no ambiguity

* Make doc.is_sentenced return True if len(doc) < 2.

* Make doc.is_nered return True if len(doc) == 0, for consistency.

Closes #3934

											
										
										
											2019-07-10 20:21:23 +03:00
+								        if len(self) < 2:
 								            return True
-												Make doc[0].is_sent_start == True (closes #2869) (#3340)

* Make doc[0] have sent_start True. Closes #2869

* Document that doc[0].is_sent_start defaults True.

											
										
										
											2019-02-27 13:17:17 +03:00
+								        for i in range(1, self.length):
-												Add doc.retokenize() context manager (#2172)

This patch takes a step towards #1487 by introducing the
doc.retokenize() context manager, to handle merging spans, and soon
splitting tokens.

The idea is to do merging and splitting like this:

with doc.retokenize() as retokenizer:
    for start, end, label in matches:
        retokenizer.merge(doc[start : end], attrs={'ent_type': label})

The retokenizer accumulates the merge requests, and applies them
together at the end of the block. This will allow retokenization to be
more efficient, and much less error prone.

A retokenizer.split() function will then be added, to handle splitting a
single token into multiple tokens. These methods take `Span` and `Token`
objects; if the user wants to go directly from offsets, they can append
to the .merges and .splits lists on the retokenizer.

The doc.merge() method's behaviour remains unchanged, so this patch
should be 100% backwards incompatible (modulo bugs). Internally,
doc.merge() fixes up the arguments (to handle the various deprecated styles),
opens the retokenizer, and makes the single merge.

We can later start making deprecation warnings on direct calls to doc.merge(),
to migrate people to use of the retokenize context manager.
											
										
										
											2018-04-03 15:10:35 +03:00
+								            if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
 								                return True
-												💫 Improve Doc.to_json and add Doc.is_nered (#3381)

* Use default return instead of else

* Add Doc.is_nered to indicate if entities have been set

* Add properties in Doc.to_json if they were set, not if they're available

This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training.

											
										
										
											2019-03-10 17:24:34 +03:00
+								        return False
 								    @property
 								    def is_nered(self):
 								        """Check if the document has named entities set. Will return True if
 								        *any* of the tokens has a named entity tag set (even if the others are
-												additional information if doc is empty

											
										
										
											2020-03-09 20:08:18 +03:00
+								        unknown values), or if the document is empty.
-												💫 Improve Doc.to_json and add Doc.is_nered (#3381)

* Use default return instead of else

* Add Doc.is_nered to indicate if entities have been set

* Add properties in Doc.to_json if they were set, not if they're available

This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training.

											
										
										
											2019-03-10 17:24:34 +03:00
+								        """
-												Return True from doc.is_... when no ambiguity

* Make doc.is_sentenced return True if len(doc) < 2.

* Make doc.is_nered return True if len(doc) == 0, for consistency.

Closes #3934

											
										
										
											2019-07-10 20:21:23 +03:00
+								        if len(self) == 0:
 								            return True
-												💫 Improve Doc.to_json and add Doc.is_nered (#3381)

* Use default return instead of else

* Add Doc.is_nered to indicate if entities have been set

* Add properties in Doc.to_json if they were set, not if they're available

This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training.

											
										
										
											2019-03-10 17:24:34 +03:00
+								        for i in range(self.length):
 								            if self.c[i].ent_iob != 0:
 								                return True
 								        return False
-												Add doc.retokenize() context manager (#2172)

This patch takes a step towards #1487 by introducing the
doc.retokenize() context manager, to handle merging spans, and soon
splitting tokens.

The idea is to do merging and splitting like this:

with doc.retokenize() as retokenizer:
    for start, end, label in matches:
        retokenizer.merge(doc[start : end], attrs={'ent_type': label})

The retokenizer accumulates the merge requests, and applies them
together at the end of the block. This will allow retokenization to be
more efficient, and much less error prone.

A retokenizer.split() function will then be added, to handle splitting a
single token into multiple tokens. These methods take `Span` and `Token`
objects; if the user wants to go directly from offsets, they can append
to the .merges and .splits lists on the retokenizer.

The doc.merge() method's behaviour remains unchanged, so this patch
should be 100% backwards incompatible (modulo bugs). Internally,
doc.merge() fixes up the arguments (to handle the various deprecated styles),
opens the retokenizer, and makes the single merge.

We can later start making deprecation warnings on direct calls to doc.merge(),
to migrate people to use of the retokenize context manager.
											
										
										
											2018-04-03 15:10:35 +03:00
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								    def __getitem__(self, object i):
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        """Get a `Token` or `Span` object.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        i (int or tuple) The index of the token, or the slice of the document
 								            to get.
-												Update docstring for Doc.__getitem__

											
										
										
											2017-05-19 01:30:51 +03:00
+								        RETURNS (Token or Span): The token at `doc[i]]`, or the span at
 								            `doc[start : end]`.
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        EXAMPLE:
 								            >>> doc[i]
 								            Get the `Token` object at position `i`, where `i` is an integer.
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
+								            Negative indexing is supported, and follows the usual Python
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								            semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.
 								            >>> doc[start : end]]
 								            Get a `Span` object, starting at position `start` and ending at
 								            position `end`, where `start` and `end` are token indices. For
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								            instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and
 . Stepped slices (e.g. `doc[start : end : step]`) are not
 								            supported, as `Span` objects must be contiguous (cannot have gaps).
 								            You can use negative indices and open-ended ranges, which have
 								            their normal Python semantics.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								        DOCS: https://spacy.io/api/doc#getitem
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        if isinstance(i, slice):
-												Refactor to remove duplicate slicing logic

											
										
										
											2015-10-07 11:25:35 +03:00
+								            start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
 								            return Span(self, start, stop, label=0)
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        if i < 0:
 								            i = self.length + i
 								        bounds_check(i, self.length, PADDING)
-												Remove caching of Token in Doc, as caused cycle.

											
										
										
											2017-10-16 20:34:21 +03:00
+								        return Token.cinit(self.vocab, &self.c[i], i, self)
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
 								    def __iter__(self):
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        """Iterate over `Token`  objects, from which the annotations can be
 								        easily accessed. This is the main way of accessing `Token` objects,
 								        which are the main way annotations are accessed from Python. If faster-
 								        than-Python speeds are required, you can instead access the annotations
 								        as a numpy array, or access the underlying C data directly from Cython.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        DOCS: https://spacy.io/api/doc#iter
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Make minor efficiency improvement in Doc.__iter__

											
										
										
											2015-07-18 05:10:53 +03:00
+								        cdef int i
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        for i in range(self.length):
-												Remove caching of Token in Doc, as caused cycle.

											
										
										
											2017-10-16 20:34:21 +03:00
+								            yield Token.cinit(self.vocab, &self.c[i], i, self)
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
 								    def __len__(self):
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        """The number of tokens in the document.
-												Update docstrings and API docs for Doc

											
										
										
											2017-05-19 19:47:39 +03:00
+								        RETURNS (int): The number of tokens in the document.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        DOCS: https://spacy.io/api/doc#len
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        return self.length
 								    def __unicode__(self):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        return "".join([t.text_with_ws for t in self])
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
-												fixed error when printing unicode

											
										
										
											2015-11-02 21:22:18 +03:00
+								    def __bytes__(self):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        return "".join([t.text_with_ws for t in self]).encode("utf-8")
-												fixed error when printing unicode

											
										
										
											2015-11-02 21:22:18 +03:00
-												* Fix string coercion for Python 3

											
										
										
											2015-07-24 04:49:30 +03:00
+								    def __str__(self):
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        if is_config(python3=True):
-												fixed error when printing unicode

											
										
										
											2015-11-02 21:22:18 +03:00
+								            return self.__unicode__()
 								        return self.__bytes__()
-												* Fix string coercion for Python 3

											
										
										
											2015-07-24 04:49:30 +03:00
-												added __repr__ that prints text in ipython for doc, token, and span objects

											
										
										
											2015-10-21 14:11:46 +03:00
+								    def __repr__(self):
-												fixed error when printing unicode

											
										
										
											2015-11-02 21:22:18 +03:00
+								        return self.__str__()
-												added __repr__ that prints text in ipython for doc, token, and span objects

											
										
										
											2015-10-21 14:11:46 +03:00
-												Add noun_chunks to Span

											
										
										
											2016-11-24 13:47:20 +03:00
+								    @property
 								    def doc(self):
 								        return self
-												Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries

Add a `mode` option to allow `Doc.char_span` to snap to token
boundaries. The `mode` options:

* `strict`: character offsets must match token boundaries (default, same as
before)
* `inside`: all tokens completely within the character span
* `outside`: all tokens at least partially covered by the character span

Add a new helper function `token_by_char` that returns the token
corresponding to a character position in the text. Update
`token_by_start` and `token_by_end` to use `token_by_char` for more
efficient searching.

* Remove unused import

* Rename mode to alignment_mode

Rename `mode` to `alignment_mode` with the options
`strict`/`contract`/`expand`. Any unrecognized modes are silently
converted to `strict`.
											
										
										
											2020-08-04 14:36:32 +03:00
+								    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict"):
 								        """Create a `Span` object from the slice
 								        `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
 								        created.
-												Add Doc.char_span method, to get a span by character offset

											
										
										
											2017-08-19 13:21:09 +03:00
 								        doc (Doc): The parent document.
-												Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries

Add a `mode` option to allow `Doc.char_span` to snap to token
boundaries. The `mode` options:

* `strict`: character offsets must match token boundaries (default, same as
before)
* `inside`: all tokens completely within the character span
* `outside`: all tokens at least partially covered by the character span

Add a new helper function `token_by_char` that returns the token
corresponding to a character position in the text. Update
`token_by_start` and `token_by_end` to use `token_by_char` for more
efficient searching.

* Remove unused import

* Rename mode to alignment_mode

Rename `mode` to `alignment_mode` with the options
`strict`/`contract`/`expand`. Any unrecognized modes are silently
converted to `strict`.
											
										
										
											2020-08-04 14:36:32 +03:00
+								        start_idx (int): The index of the first character of the span.
 								        end_idx (int): The index of the first character after the span.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        label (uint64 or string): A label to attach to the Span, e.g. for
 								            named entities.
-												Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries

Add a `mode` option to allow `Doc.char_span` to snap to token
boundaries. The `mode` options:

* `strict`: character offsets must match token boundaries (default, same as
before)
* `inside`: all tokens completely within the character span
* `outside`: all tokens at least partially covered by the character span

Add a new helper function `token_by_char` that returns the token
corresponding to a character position in the text. Update
`token_by_start` and `token_by_end` to use `token_by_char` for more
efficient searching.

* Remove unused import

* Rename mode to alignment_mode

Rename `mode` to `alignment_mode` with the options
`strict`/`contract`/`expand`. Any unrecognized modes are silently
converted to `strict`.
											
										
										
											2020-08-04 14:36:32 +03:00
+								        kb_id (uint64 or string):  An ID from a KB to capture the meaning of a
 								            named entity.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
 								            the span.
-												Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries

Add a `mode` option to allow `Doc.char_span` to snap to token
boundaries. The `mode` options:

* `strict`: character offsets must match token boundaries (default, same as
before)
* `inside`: all tokens completely within the character span
* `outside`: all tokens at least partially covered by the character span

Add a new helper function `token_by_char` that returns the token
corresponding to a character position in the text. Update
`token_by_start` and `token_by_end` to use `token_by_char` for more
efficient searching.

* Remove unused import

* Rename mode to alignment_mode

Rename `mode` to `alignment_mode` with the options
`strict`/`contract`/`expand`. Any unrecognized modes are silently
converted to `strict`.
											
										
										
											2020-08-04 14:36:32 +03:00
+								        alignment_mode (str): How character indices are aligned to token
 								            boundaries. Options: "strict" (character indices must be aligned
 								            with token boundaries), "contract" (span of all tokens completely
 								            within the character span), "expand" (span of all tokens at least
 								            partially covered by the character span). Defaults to "strict".
-												Add Doc.char_span method, to get a span by character offset

											
										
										
											2017-08-19 13:21:09 +03:00
+								        RETURNS (Span): The newly constructed object.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								        DOCS: https://spacy.io/api/doc#char_span
-												Add Doc.char_span method, to get a span by character offset

											
										
										
											2017-08-19 13:21:09 +03:00
+								        """
-												Allow span label to be string in Doc.char_span

											
										
										
											2017-08-19 17:18:09 +03:00
+								        if not isinstance(label, int):
 								            label = self.vocab.strings.add(label)
-												annotate kb_id through ents in doc

											
										
										
											2019-03-14 17:48:40 +03:00
+								        if not isinstance(kb_id, int):
 								            kb_id = self.vocab.strings.add(kb_id)
-												Add alignment mode error and fix Doc.char_span docs (#6820)

* Raise an error on an unrecognized alignment mode rather than
defaulting to `strict`
* Fix the `Doc.char_span` API doc alignment mode details
											
										
										
											2021-01-27 15:40:42 +03:00
+								        alignment_modes = ("strict", "contract", "expand")
 								        if alignment_mode not in alignment_modes:
 								            raise ValueError(Errors.E202.format(mode=alignment_mode, modes=", ".join(alignment_modes)))
-												Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries

Add a `mode` option to allow `Doc.char_span` to snap to token
boundaries. The `mode` options:

* `strict`: character offsets must match token boundaries (default, same as
before)
* `inside`: all tokens completely within the character span
* `outside`: all tokens at least partially covered by the character span

Add a new helper function `token_by_char` that returns the token
corresponding to a character position in the text. Update
`token_by_start` and `token_by_end` to use `token_by_char` for more
efficient searching.

* Remove unused import

* Rename mode to alignment_mode

Rename `mode` to `alignment_mode` with the options
`strict`/`contract`/`expand`. Any unrecognized modes are silently
converted to `strict`.
											
										
										
											2020-08-04 14:36:32 +03:00
+								        cdef int start = token_by_char(self.c, self.length, start_idx)
 								        if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
-												Add Doc.char_span method, to get a span by character offset

											
										
										
											2017-08-19 13:21:09 +03:00
+								            return None
-												Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries

Add a `mode` option to allow `Doc.char_span` to snap to token
boundaries. The `mode` options:

* `strict`: character offsets must match token boundaries (default, same as
before)
* `inside`: all tokens completely within the character span
* `outside`: all tokens at least partially covered by the character span

Add a new helper function `token_by_char` that returns the token
corresponding to a character position in the text. Update
`token_by_start` and `token_by_end` to use `token_by_char` for more
efficient searching.

* Remove unused import

* Rename mode to alignment_mode

Rename `mode` to `alignment_mode` with the options
`strict`/`contract`/`expand`. Any unrecognized modes are silently
converted to `strict`.
											
										
										
											2020-08-04 14:36:32 +03:00
+								        # end_idx is exclusive, so find the token at one char before
 								        cdef int end = token_by_char(self.c, self.length, end_idx - 1)
 								        if end < 0 or (alignment_mode == "strict" and end_idx != self[end].idx + len(self[end])):
-												Add Doc.char_span method, to get a span by character offset

											
										
										
											2017-08-19 13:21:09 +03:00
+								            return None
-												Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries

Add a `mode` option to allow `Doc.char_span` to snap to token
boundaries. The `mode` options:

* `strict`: character offsets must match token boundaries (default, same as
before)
* `inside`: all tokens completely within the character span
* `outside`: all tokens at least partially covered by the character span

Add a new helper function `token_by_char` that returns the token
corresponding to a character position in the text. Update
`token_by_start` and `token_by_end` to use `token_by_char` for more
efficient searching.

* Remove unused import

* Rename mode to alignment_mode

Rename `mode` to `alignment_mode` with the options
`strict`/`contract`/`expand`. Any unrecognized modes are silently
converted to `strict`.
											
										
										
											2020-08-04 14:36:32 +03:00
+								        # Adjust start and end by alignment_mode
 								        if alignment_mode == "contract":
 								            if self[start].idx < start_idx:
 								                start += 1
 								            if end_idx < self[end].idx + len(self[end]):
 								                end -= 1
 								            # if no tokens are completely within the span, return None
 								            if end < start:
 								                return None
 								        elif alignment_mode == "expand":
 								            # Don't consider the trailing whitespace to be part of the previous
 								            # token
 								            if start_idx == self[start].idx + len(self[start]):
 								                start += 1
-												Add Doc.char_span method, to get a span by character offset

											
										
										
											2017-08-19 13:21:09 +03:00
+								        # Currently we have the token index, we want the range-end index
 								        end += 1
-												annotate kb_id through ents in doc

											
										
										
											2019-03-14 17:48:40 +03:00
+								        cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
-												Add Doc.char_span method, to get a span by character offset

											
										
										
											2017-08-19 13:21:09 +03:00
+								        return span
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								    def similarity(self, other):
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        """Make a semantic similarity estimate. The default estimate is cosine
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
+								        similarity using an average of word vectors.
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        other (object): The object to compare with. By default, accepts `Doc`,
 								            `Span`, `Token` and `Lexeme` objects.
 								        RETURNS (float): A scalar similarity score. Higher is more similar.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								        DOCS: https://spacy.io/api/doc#similarity
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        if "similarity" in self.user_hooks:
 								            return self.user_hooks["similarity"](self, other)
-												Make .similarity() return 1.0 if all orth attrs match

											
										
										
											2018-01-15 18:29:48 +03:00
+								        if isinstance(other, (Lexeme, Token)) and self.length == 1:
 								            if self.c[0].lex.orth == other.orth:
 								                return 1.0
-												bugfix in span similarity (#5155) (#5358)

* bugfix in span similarity

* also rewrite doc.pyx for clarity

* formatting

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
											
										
										
											2020-04-27 17:51:27 +03:00
+								        elif isinstance(other, (Span, Doc)) and len(self) == len(other):
 								            similar = True
 								            for i in range(self.length):
 								                if self[i].orth != other[i].orth:
 								                    similar = False
 								                    break
 								            if similar:
 								                return 1.0
-												💫 Add .similarity warnings for no vectors and option to exclude warnings (#2197)

* Add logic to filter out warning IDs via environment variable

Usage: SPACY_WARNING_EXCLUDE=W001,W007

* Add warnings for empty vectors

* Add warning if no word vectors are used in .similarity methods

For example, if only tensors are available in small models – should hopefully clear up some confusion around this

* Capture warnings in tests

* Rename SPACY_WARNING_EXCLUDE to SPACY_WARNING_IGNORE

											
										
										
											2018-05-21 02:22:38 +03:00
+								        if self.vocab.vectors.n_keys == 0:
-												Simplify warnings

											
										
										
											2020-04-28 14:37:37 +03:00
+								            warnings.warn(Warnings.W007.format(obj="Doc"))
-												* Fix vectors bugs for OOV words

											
										
										
											2015-09-22 03:10:01 +03:00
+								        if self.vector_norm == 0 or other.vector_norm == 0:
-												Simplify warnings

											
										
										
											2020-04-28 14:37:37 +03:00
+								            warnings.warn(Warnings.W008.format(obj="Doc"))
-												* Fix vectors bugs for OOV words

											
										
										
											2015-09-22 03:10:01 +03:00
+								            return 0.0
-												Don't use numpy directly for similarity (#3362)

* Don't use numpy directly for similarity

* Contributor agreement

											
										
										
											2019-03-07 01:58:38 +03:00
+								        vector = self.vector
 								        xp = get_array_module(vector)
-												Tidy up and only use self.vector once

											
										
										
											2019-03-07 03:06:12 +03:00
+								        return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def has_vector(self):
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        """A boolean value indicating whether a word vector is associated with
 								        the object.
 								        RETURNS (bool): Whether a word vector is associated with the object.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								        DOCS: https://spacy.io/api/doc#has_vector
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        if "has_vector" in self.user_hooks:
 								            return self.user_hooks["has_vector"](self)
 								        elif self.vocab.vectors.data.size:
 								            return True
 								        elif self.tensor.size:
 								            return True
 								        else:
 								            return False
-												* Fix Issue #367: Missing has_vector property on Doc and Span objects

											
										
										
											2016-05-09 13:36:14 +03:00
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								    property vector:
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        """A real-valued meaning representation. Defaults to an average of the
 								        token vectors.
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
 								            representing the document's semantics.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								        DOCS: https://spacy.io/api/doc#vector
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								        def __get__(self):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            if "vector" in self.user_hooks:
 								                return self.user_hooks["vector"](self)
-												Try using tensor for vector/similarity methdos

											
										
										
											2017-05-31 00:35:17 +03:00
+								            if self._vector is not None:
 								                return self._vector
-												Fix similarity calculation if vectors are on GPU (#3440)


											
										
										
											2019-03-20 14:09:59 +03:00
+								            xp = get_array_module(self.vocab.vectors.data)
 								            if not len(self):
 								                self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
-												Fix Doc.vector for empty doc objects

											
										
										
											2017-08-22 20:52:19 +03:00
+								                return self._vector
-												Back-off to tensor for similarity if no vectors

											
										
										
											2017-11-03 22:56:33 +03:00
+								            elif self.vocab.vectors.data.size > 0:
-												Don't use numpy directly for similarity (#3362)

* Don't use numpy directly for similarity

* Contributor agreement

											
										
										
											2019-03-07 01:58:38 +03:00
+								                self._vector = sum(t.vector for t in self) / len(self)
-												Try using tensor for vector/similarity methdos

											
										
										
											2017-05-31 00:35:17 +03:00
+								                return self._vector
-												Back-off to tensor for similarity if no vectors

											
										
										
											2017-11-03 22:56:33 +03:00
+								            elif self.tensor.size > 0:
-												Try using tensor for vector/similarity methdos

											
										
										
											2017-05-31 00:35:17 +03:00
+								                self._vector = self.tensor.mean(axis=0)
 								                return self._vector
 								            else:
-												Fix similarity calculation if vectors are on GPU (#3440)


											
										
										
											2019-03-20 14:09:59 +03:00
+								                return xp.zeros((self.vocab.vectors_length,), dtype="float32")
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
-												* Try giving Doc and Span objects vector and vector_norm attributes, and .similarity functions. Turns out to be bad idea.

											
										
										
											2015-09-17 04:50:11 +03:00
+								        def __set__(self, value):
 								            self._vector = value
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
 								    property vector_norm:
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-19 00:59:44 +03:00
+								        """The L2 norm of the document's vector representation.
 								        RETURNS (float): The L2 norm of the vector representation.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								        DOCS: https://spacy.io/api/doc#vector_norm
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-19 00:59:44 +03:00
+								        """
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								        def __get__(self):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            if "vector_norm" in self.user_hooks:
 								                return self.user_hooks["vector_norm"](self)
-												* Try giving Doc and Span objects vector and vector_norm attributes, and .similarity functions. Turns out to be bad idea.

											
										
										
											2015-09-17 04:50:11 +03:00
+								            cdef float value
-												Fix calculation of vector norm, re Issue #522. Need to consolidate the calculations into a helper function.

											
										
										
											2016-10-23 15:49:31 +03:00
+								            cdef double norm = 0
-												* Try giving Doc and Span objects vector and vector_norm attributes, and .similarity functions. Turns out to be bad idea.

											
										
										
											2015-09-17 04:50:11 +03:00
+								            if self._vector_norm is None:
-												Fix calculation of vector norm, re Issue #522. Need to consolidate the calculations into a helper function.

											
										
										
											2016-10-23 15:49:31 +03:00
+								                norm = 0.0
-												* Try giving Doc and Span objects vector and vector_norm attributes, and .similarity functions. Turns out to be bad idea.

											
										
										
											2015-09-17 04:50:11 +03:00
+								                for value in self.vector:
-												Fix calculation of vector norm, re Issue #522. Need to consolidate the calculations into a helper function.

											
										
										
											2016-10-23 15:49:31 +03:00
+								                    norm += value * value
 								                self._vector_norm = sqrt(norm) if norm != 0 else 0
-												* Try giving Doc and Span objects vector and vector_norm attributes, and .similarity functions. Turns out to be bad idea.

											
										
										
											2015-09-17 04:50:11 +03:00
+								            return self._vector_norm
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												* Try giving Doc and Span objects vector and vector_norm attributes, and .similarity functions. Turns out to be bad idea.

											
										
										
											2015-09-17 04:50:11 +03:00
+								        def __set__(self, value):
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
+								            self._vector_norm = value
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def text(self):
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        """A unicode representation of the document text.
 								        RETURNS (unicode): The original verbatim text of the document.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return "".join(t.text_with_ws for t in self)
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def text_with_ws(self):
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        """An alias of `Doc.text`, provided for duck-type compatibility with
 								        `Span` and `Token`.
 								        RETURNS (unicode): The original verbatim text of the document.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.text
-												* Add test and test_with_ws attributes.

											
										
										
											2015-09-13 03:27:42 +03:00
-												* Gazetteer stuff working, now need to wire up to API

											
										
										
											2015-08-06 01:35:40 +03:00
+								    property ents:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """The named entities in the document. Returns a tuple of named entity
 								        `Span` objects, if the entity recognizer has been applied.
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        RETURNS (tuple): Entities in the document, one `Span` per entity.
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        DOCS: https://spacy.io/api/doc#ents
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Gazetteer stuff working, now need to wire up to API

											
										
										
											2015-08-06 01:35:40 +03:00
+								        def __get__(self):
 								            cdef int i
 								            cdef const TokenC* token
 								            cdef int start = -1
-												Fixes for new StringStore

											
										
										
											2017-05-28 19:09:27 +03:00
+								            cdef attr_t label = 0
-												annotate kb_id through ents in doc

											
										
										
											2019-03-14 17:48:40 +03:00
+								            cdef attr_t kb_id = 0
-												* Gazetteer stuff working, now need to wire up to API

											
										
										
											2015-08-06 01:35:40 +03:00
+								            output = []
 								            for i in range(self.length):
-												* Rename Doc.data to Doc.c

											
										
										
											2015-11-03 16:15:14 +03:00
+								                token = &self.c[i]
-												* Gazetteer stuff working, now need to wire up to API

											
										
										
											2015-08-06 01:35:40 +03:00
+								                if token.ent_iob == 1:
-												Improve error message when entity sequence is inconsistent

											
										
										
											2018-03-26 08:13:34 +03:00
+								                    if start == -1:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								                        seq = ["%s|%s" % (t.text, t.ent_iob_) for t in self[i-5:i+5]]
 								                        raise ValueError(Errors.E093.format(seq=" ".join(seq)))
-												* Gazetteer stuff working, now need to wire up to API

											
										
										
											2015-08-06 01:35:40 +03:00
+								                elif token.ent_iob == 2 or token.ent_iob == 0:
 								                    if start != -1:
-												annotate kb_id through ents in doc

											
										
										
											2019-03-14 17:48:40 +03:00
+								                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
-												* Gazetteer stuff working, now need to wire up to API

											
										
										
											2015-08-06 01:35:40 +03:00
+								                    start = -1
 								                    label = 0
-												annotate kb_id through ents in doc

											
										
										
											2019-03-14 17:48:40 +03:00
+								                    kb_id = 0
-												* Gazetteer stuff working, now need to wire up to API

											
										
										
											2015-08-06 01:35:40 +03:00
+								                elif token.ent_iob == 3:
 								                    if start != -1:
-												annotate kb_id through ents in doc

											
										
										
											2019-03-14 17:48:40 +03:00
+								                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
-												* Gazetteer stuff working, now need to wire up to API

											
										
										
											2015-08-06 01:35:40 +03:00
+								                    start = i
 								                    label = token.ent_type
-												annotate kb_id through ents in doc

											
										
										
											2019-03-14 17:48:40 +03:00
+								                    kb_id = token.ent_kb_id
-												* Gazetteer stuff working, now need to wire up to API

											
										
										
											2015-08-06 01:35:40 +03:00
+								            if start != -1:
-												annotate kb_id through ents in doc

											
										
										
											2019-03-14 17:48:40 +03:00
+								                output.append(Span(self, start, self.length, label=label, kb_id=kb_id))
-												* Gazetteer stuff working, now need to wire up to API

											
										
										
											2015-08-06 01:35:40 +03:00
+								            return tuple(output)
 								        def __set__(self, ents):
 								            # TODO:
-												Distinction between outside, missing and blocked NER annotations (#4307)

* remove duplicate unit test

* unit test (currently failing) for issue 4267

* bugfix: ensure doc.ents preserves kb_id annotations

* fix in setting doc.ents with empty label

* rename

* test for presetting an entity to a certain type

* allow overwriting Outside + blocking presets

* fix actions when previous label needs to be kept

* fix default ent_iob in set entities

* cleaner solution with U- action

* remove debugging print statements

* unit tests with explicit transitions and is_valid testing

* remove U- from move_names explicitly

* remove unit tests with pre-trained models that don't work

* remove (working) unit tests with pre-trained models

* clean up unit tests

* move unit tests

* small fixes

* remove two TODO's from doc.ents comments

											
										
										
											2019-09-18 22:37:17 +03:00
+								            # 1. Test basic data-driven ORTH gazetteer
 								            # 2. Test more nuanced date and currency regex
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								            tokens_in_ents = {}
 								            cdef attr_t entity_type
-												Distinction between outside, missing and blocked NER annotations (#4307)

* remove duplicate unit test

* unit test (currently failing) for issue 4267

* bugfix: ensure doc.ents preserves kb_id annotations

* fix in setting doc.ents with empty label

* rename

* test for presetting an entity to a certain type

* allow overwriting Outside + blocking presets

* fix actions when previous label needs to be kept

* fix default ent_iob in set entities

* cleaner solution with U- action

* remove debugging print statements

* unit tests with explicit transitions and is_valid testing

* remove U- from move_names explicitly

* remove unit tests with pre-trained models that don't work

* remove (working) unit tests with pre-trained models

* clean up unit tests

* move unit tests

* small fixes

* remove two TODO's from doc.ents comments

											
										
										
											2019-09-18 22:37:17 +03:00
+								            cdef attr_t kb_id
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								            cdef int ent_start, ent_end
 								            for ent_info in ents:
-												Ensure that doc.ents preserves kb_id annotations (#4294)

* bugfix: ensure doc.ents preserves kb_id annotations

* fix backward compatibility

* additional test

											
										
										
											2019-09-16 16:18:37 +03:00
+								                entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info)
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								                for token_index in range(ent_start, ent_end):
 								                    if token_index in tokens_in_ents.keys():
 								                        raise ValueError(Errors.E103.format(
 								                            span1=(tokens_in_ents[token_index][0],
 								                                   tokens_in_ents[token_index][1],
 								                                   self.vocab.strings[tokens_in_ents[token_index][2]]),
 								                            span2=(ent_start, ent_end, self.vocab.strings[entity_type])))
-												Ensure that doc.ents preserves kb_id annotations (#4294)

* bugfix: ensure doc.ents preserves kb_id annotations

* fix backward compatibility

* additional test

											
										
										
											2019-09-16 16:18:37 +03:00
+								                    tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
-												* Gazetteer stuff working, now need to wire up to API

											
										
										
											2015-08-06 01:35:40 +03:00
+								            cdef int i
 								            for i in range(self.length):
-												Distinction between outside, missing and blocked NER annotations (#4307)

* remove duplicate unit test

* unit test (currently failing) for issue 4267

* bugfix: ensure doc.ents preserves kb_id annotations

* fix in setting doc.ents with empty label

* rename

* test for presetting an entity to a certain type

* allow overwriting Outside + blocking presets

* fix actions when previous label needs to be kept

* fix default ent_iob in set entities

* cleaner solution with U- action

* remove debugging print statements

* unit tests with explicit transitions and is_valid testing

* remove U- from move_names explicitly

* remove unit tests with pre-trained models that don't work

* remove (working) unit tests with pre-trained models

* clean up unit tests

* move unit tests

* small fixes

* remove two TODO's from doc.ents comments

											
										
										
											2019-09-18 22:37:17 +03:00
+								                # default values
 								                entity_type = 0
 								                kb_id = 0
 								                # Set ent_iob to Missing (0) bij default unless this token was nered before
 								                ent_iob = 0
 								                if self.c[i].ent_iob != 0:
 								                    ent_iob = 2
 								                # overwrite if the token was part of a specified entity
 								                if i in tokens_in_ents.keys():
 								                    ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
 								                    if entity_type is None or entity_type <= 0:
 								                        # Blocking this token from being overwritten by downstream NER
 								                        ent_iob = 3
 								                    elif ent_start == i:
 								                        # Marking the start of an entity
 								                        ent_iob = 3
 								                    else:
 								                        # Marking the inside of an entity
 								                        ent_iob = 1
 								                self.c[i].ent_type = entity_type
 								                self.c[i].ent_kb_id = kb_id
 								                self.c[i].ent_iob = ent_iob
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def noun_chunks(self):
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        """Iterate over the base noun phrases in the document. Yields base
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        noun-phrase #[code Span] objects, if the document has been
 								        syntactically parsed. A base noun phrase, or "NP chunk", is a noun
 								        phrase that does not permit other NPs to be nested within it – so no
 								        NP-level coordination, no prepositional phrases, and no relative
 								        clauses.
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
 								        YIELDS (Span): Noun chunks in the document.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								        DOCS: https://spacy.io/api/doc#noun_chunks
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Limiting noun_chunks for specific languages (#5396)

* Limiting noun_chunks for specific langauges

* Limiting noun_chunks for specific languages

Contributor Agreement

* Addressing review comments

* Removed unused fixtures and imports

* Add fa_tokenizer in test suite

* Use fa_tokenizer in test

* Undo extraneous reformatting

Co-authored-by: adrianeboyd <adrianeboyd@gmail.com>
											
										
										
											2020-05-14 13:58:06 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        # Accumulate the result before beginning to iterate over it. This
 								        # prevents the tokenisation from being changed out from under us
 								        # during the iteration. The tricky thing here is that Span accepts
 								        # its tokenisation changing, so it's okay once we have the Span
 								        # objects. See Issue #375.
 								        spans = []
 								        if self.noun_chunks_iterator is not None:
 								            for start, end, label in self.noun_chunks_iterator(self):
 								                spans.append(Span(self, start, end, label=label))
 								        for span in spans:
 								            yield span
 								    @property
 								    def sents(self):
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        """Iterate over the sentences in the document. Yields sentence `Span`
 								        objects. Sentence spans have no label. To improve accuracy on informal
 								        texts, spaCy calculates sentence boundaries from the syntactic
 								        dependency parse. If the parser is disabled, the `sents` iterator will
 								        be unavailable.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        YIELDS (Span): Sentences in the document.
 								        DOCS: https://spacy.io/api/doc#sents
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        if not self.is_sentenced:
 								            raise ValueError(Errors.E030)
 								        if "sents" in self.user_hooks:
 								            yield from self.user_hooks["sents"](self)
 								        else:
 								            start = 0
 								            for i in range(1, self.length):
 								                if self.c[i].sent_start == 1:
 								                    yield Span(self, start, i)
 								                    start = i
 								            if start != self.length:
 								                yield Span(self, start, self.length)
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
-												Add Doc.lang and Doc.lang_

											
										
										
											2019-03-11 16:21:40 +03:00
+								    @property
 								    def lang(self):
 								        """RETURNS (uint64): ID of the language of the doc's vocabulary."""
 								        return self.vocab.strings[self.vocab.lang]
 								    @property
 								    def lang_(self):
 								        """RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'."""
 								        return self.vocab.lang
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
-												* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string

											
										
										
											2015-07-13 22:46:02 +03:00
+								    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
-												Fix Issue #599, by considering empty documents to be parsed and tagged. Implementation is a bit dodgy.

											
										
										
											2016-11-03 01:47:46 +03:00
+								        if self.length == 0:
 								            # Flip these to false when we see the first token.
 								            self.is_tagged = False
 								            self.is_parsed = False
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        if self.length == self.max_length:
 								            self._realloc(self.length * 2)
-												* Rename Doc.data to Doc.c

											
										
										
											2015-11-03 16:15:14 +03:00
+								        cdef TokenC* t = &self.c[self.length]
-												* More work on language-generic parsing

											
										
										
											2015-08-28 03:02:33 +03:00
+								        if LexemeOrToken is const_TokenC_ptr:
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								            t[0] = lex_or_tok[0]
 								        else:
 								            t.lex = lex_or_tok
-												* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string

											
										
										
											2015-07-13 22:46:02 +03:00
+								        if self.length == 0:
 								            t.idx = 0
 								        else:
 								            t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
-												* Fix L/R edge bug, by ensuring l_edge and r_edge are preset, and fixing the way the edge update in del_arc. Bugs keep arising here because the edges are absolute positions, where everything else is relative. I'm also not 100% convinced that del_arc is handled correctly. Do we need to update the parents?

											
										
										
											2015-09-09 04:39:46 +03:00
+								        t.l_edge = self.length
 								        t.r_edge = self.length
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								        if t.lex.orth == 0:
 								            raise ValueError(Errors.E031.format(i=self.length))
-												* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string

											
										
										
											2015-07-13 22:46:02 +03:00
+								        t.spacy = has_space
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        self.length += 1
-												Make doc[0].is_sent_start == True (closes #2869) (#3340)

* Make doc[0] have sent_start True. Closes #2869

* Document that doc[0].is_sent_start defaults True.

											
										
										
											2019-02-27 13:17:17 +03:00
+								        if self.length == 1:
 								            # Set token.sent_start to 1 for first token. See issue #2869
 								            self.c[0].sent_start = 1
-												* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string

											
										
										
											2015-07-13 22:46:02 +03:00
+								        return t.idx + t.lex.length + t.spacy
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
 								    @cython.boundscheck(False)
 								    cpdef np.ndarray to_array(self, object py_attr_ids):
-												Support strings for attribute list in doc.to_array

											
										
										
											2017-10-19 17:07:14 +03:00
+								        """Export given token attributes to a numpy `ndarray`.
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        If `attr_ids` is a sequence of M attributes, the output array will be
 								        of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If
 								        `attr_ids` is a single attribute, the output shape will be (N,). You
 								        can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or
 								        string name (e.g. 'LEMMA' or 'lemma').
-												Support strings for attribute list in doc.to_array

											
										
										
											2017-10-19 17:07:14 +03:00
 								        attr_ids (list[]): A list of attributes (int IDs or string names).
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
 								            per word, and one column per attribute indicated in the input
 								            `attr_ids`.
 								        EXAMPLE:
 								            >>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
 								            >>> doc = nlp(text)
 								            >>> # All strings mapped to integers, for easy export to numpy
 								            >>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        """
 								        cdef int i, j
 								        cdef attr_id_t feature
-												* Tests passing on round-trip pack/unpack on basic example

											
										
										
											2015-07-17 22:20:48 +03:00
+								        cdef np.ndarray[attr_t, ndim=2] output
-												Support strings for attribute list in doc.to_array

											
										
										
											2017-10-19 17:07:14 +03:00
+								        # Handle scalar/list inputs of strings/ints for py_attr_ids
-												Allow single string attributes in doc.to_array()

Previously inputs like doc.to_array('ORTH') didn't work.

Closes #3064

											
										
										
											2018-12-29 18:24:40 +03:00
+								        # See also #3064
 								        if isinstance(py_attr_ids, basestring_):
 								            # Handle inputs like doc.to_array('ORTH')
 								            py_attr_ids = [py_attr_ids]
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        elif not hasattr(py_attr_ids, "__iter__"):
-												Allow single string attributes in doc.to_array()

Previously inputs like doc.to_array('ORTH') didn't work.

Closes #3064

											
										
										
											2018-12-29 18:24:40 +03:00
+								            # Handle inputs like doc.to_array(ORTH)
-												cleanup to_array implementation using fixes on master

											
										
										
											2017-10-20 14:39:37 +03:00
+								            py_attr_ids = [py_attr_ids]
 								        # Allow strings, e.g. 'lemma' or 'LEMMA'
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
-												cleanup to_array implementation using fixes on master

											
										
										
											2017-10-20 14:39:37 +03:00
+								                       for id_ in py_attr_ids]
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								        # Make an array from the attributes --- otherwise our inner loop is
 								        # Python dict iteration.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i")
 								        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
-												Improve efficiency of Doc.to_array

											
										
										
											2017-11-17 20:55:56 +03:00
+								        c_output = <attr_t*>output.data
 								        c_attr_ids = <attr_id_t*>attr_ids.data
 								        cdef TokenC* token
 								        cdef int nr_attr = attr_ids.shape[0]
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        for i in range(self.length):
-												Improve efficiency of Doc.to_array

											
										
										
											2017-11-17 20:55:56 +03:00
+								            token = &self.c[i]
 								            for j in range(nr_attr):
 								                c_output[i*nr_attr + j] = get_token_attr(token, c_attr_ids[j])
-												cleanup to_array implementation using fixes on master

											
										
										
											2017-10-20 14:39:37 +03:00
+								        # Handle 1d case
 								        return output if len(attr_ids) >= 2 else output.reshape((self.length,))
-												counter instead of preshcounter

											
										
										
											2019-07-11 14:05:53 +03:00
+								    def count_by(self, attr_id_t attr_id, exclude=None, object counts=None):
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        """Count the frequencies of a given attribute. Produces a dict of
 								        `{attribute (int): count (ints)}` frequencies, keyed by the values of
 								        the given attribute ID.
 								        attr_id (int): The attribute ID to key the counts.
 								        RETURNS (dict): A dictionary mapping attributes to integer counts.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        DOCS: https://spacy.io/api/doc#count_by
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        """
 								        cdef int i
 								        cdef attr_t attr
 								        cdef size_t count
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												* Extend count_by method

											
										
										
											2015-07-14 04:20:09 +03:00
+								        if counts is None:
-												counter instead of preshcounter

											
										
										
											2019-07-11 14:05:53 +03:00
+								            counts = Counter()
-												* Extend count_by method

											
										
										
											2015-07-14 04:20:09 +03:00
+								            output_dict = True
 								        else:
 								            output_dict = False
 								        # Take this check out of the loop, for a bit of extra speed
 								        if exclude is None:
 								            for i in range(self.length):
-												cleanup

											
										
										
											2019-07-11 14:09:22 +03:00
+								                counts[get_token_attr(&self.c[i], attr_id)] += 1
-												* Extend count_by method

											
										
										
											2015-07-14 04:20:09 +03:00
+								        else:
 								            for i in range(self.length):
 								                if not exclude(self[i]):
-												cleanup

											
										
										
											2019-07-11 14:09:22 +03:00
+								                    counts[get_token_attr(&self.c[i], attr_id)] += 1
-												* Extend count_by method

											
										
										
											2015-07-14 04:20:09 +03:00
+								        if output_dict:
 								            return dict(counts)
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
 								    def _realloc(self, new_size):
 								        self.max_length = new_size
 								        n = new_size + (PADDING * 2)
 								        # What we're storing is a "padded" array. We've jumped forward PADDING
 								        # places, and are storing the pointer to that. This way, we can access
 								        # words out-of-bounds, and get out-of-bounds markers.
 								        # Now that we want to realloc, we need the address of the true start,
 								        # so we jump the pointer back PADDING places.
-												* Rename Doc.data to Doc.c

											
										
										
											2015-11-03 16:15:14 +03:00
+								        cdef TokenC* data_start = self.c - PADDING
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        data_start = <TokenC*>self.mem.realloc(data_start, n * sizeof(TokenC))
-												* Rename Doc.data to Doc.c

											
										
										
											2015-11-03 16:15:14 +03:00
+								        self.c = data_start + PADDING
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        cdef int i
 								        for i in range(self.length, self.max_length + PADDING):
-												* Rename Doc.data to Doc.c

											
										
										
											2015-11-03 16:15:14 +03:00
+								            self.c[i].lex = &EMPTY_LEXEME
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
-												* Make set_parse nogil

											
										
										
											2016-01-30 22:27:52 +03:00
+								    cdef void set_parse(self, const TokenC* parsed) nogil:
-												* draft de/serialization functions in doc.pyx

											
										
										
											2015-07-16 02:16:33 +03:00
+								        # TODO: This method is fairly misleading atm. It's used by Parser
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        # to actually apply the parse calculated. Need to rethink this.
-												* Reorganize the serialization functions on Doc

											
										
										
											2015-07-22 05:53:01 +03:00
+								        # Probably we should use from_array?
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        self.is_parsed = True
 								        for i in range(self.length):
-												* Rename Doc.data to Doc.c

											
										
										
											2015-11-03 16:15:14 +03:00
+								            self.c[i] = parsed[i]
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
-												Add SENT_START attribute, for custom sentence boundary detection

											
										
										
											2016-05-05 13:11:57 +03:00
+								    def from_array(self, attrs, array):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """Load attributes from a numpy array. Write to a `Doc` object, from an
 								        `(M, N)` array of attributes.
 								        attrs (list) A list of attribute ID ints.
 								        array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values.
 								        RETURNS (Doc): Itself.
 								        DOCS: https://spacy.io/api/doc#from_array
 								        """
-												Make doc.from_array() consistent with doc.to_array(). Closes #3382

											
										
										
											2019-03-10 17:50:48 +03:00
+								        # Handle scalar/list inputs of strings/ints for py_attr_ids
 								        # See also #3064
 								        if isinstance(attrs, basestring_):
 								            # Handle inputs like doc.to_array('ORTH')
 								            attrs = [attrs]
 								        elif not hasattr(attrs, "__iter__"):
 								            # Handle inputs like doc.to_array(ORTH)
 								            attrs = [attrs]
 								        # Allow strings, e.g. 'lemma' or 'LEMMA'
 								        attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
-												Handle scalar values in doc.from_array()

											
										
										
											2019-03-10 18:54:03 +03:00
+								                 for id_ in attrs]
-												Bugfix/get doc (#5049)

* new (broken) unit test

* fixing get_doc method

											
										
										
											2020-03-02 13:49:28 +03:00
+								        if array.dtype != numpy.uint64:
-												Simplify warnings

											
										
										
											2020-04-28 14:37:37 +03:00
+								            warnings.warn(Warnings.W028.format(type=array.dtype))
-												Add Doc.lang and Doc.lang_

											
										
										
											2019-03-11 16:21:40 +03:00
-												Add SENT_START attribute, for custom sentence boundary detection

											
										
										
											2016-05-05 13:11:57 +03:00
+								        if SENT_START in attrs and HEAD in attrs:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								            raise ValueError(Errors.E032)
-												Improve token head verification (#5079)

* Improve token head verification

Improve the verification for valid token heads when heads are set:

* in `Token.head`: heads come from the same document
* in `Doc.from_array()`: head indices are within the bounds of the
document

* Improve error message

											
										
										
											2020-03-03 23:44:51 +03:00
+								        cdef int i, col, abs_head_index
-												* Reorganize the serialization functions on Doc

											
										
										
											2015-07-22 05:53:01 +03:00
+								        cdef attr_id_t attr_id
-												* Rename Doc.data to Doc.c

											
										
										
											2015-11-03 16:15:14 +03:00
+								        cdef TokenC* tokens = self.c
-												* Reorganize the serialization functions on Doc

											
										
										
											2015-07-22 05:53:01 +03:00
+								        cdef int length = len(array)
-												Fix serializer

											
										
										
											2017-05-09 19:45:18 +03:00
+								        # Get set up for fast loading
 								        cdef Pool mem = Pool()
 								        cdef int n_attrs = len(attrs)
-												prevent zero-length mem alloc (#4429)

* raise specific error when removing a matcher rule that doesn't exist

* rephrasing

* goldparse init: allocate fields only if doc is not empty

* avoid zero length alloc in saving tokenizer cache

* avoid allocating zero length mem in matcher

* asserts to avoid allocating zero length mem

* fix zero-length allocation in matcher

* bump cymem version

* revert cymem version bump

											
										
										
											2019-10-22 17:54:33 +03:00
+								        # attrs should not be empty, but make sure to avoid zero-length mem alloc
 								        assert n_attrs > 0
-												Fix serializer

											
										
										
											2017-05-09 19:45:18 +03:00
+								        attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t))
 								        for i, attr_id in enumerate(attrs):
 								            attr_ids[i] = attr_id
-												Handle scalar values in doc.from_array()

											
										
										
											2019-03-10 18:54:03 +03:00
+								        if len(array.shape) == 1:
 								            array = array.reshape((array.size, 1))
-												Improve token head verification (#5079)

* Improve token head verification

Improve the verification for valid token heads when heads are set:

* in `Token.head`: heads come from the same document
* in `Doc.from_array()`: head indices are within the bounds of the
document

* Improve error message

											
										
										
											2020-03-03 23:44:51 +03:00
+								        # Check that all heads are within the document bounds
 								        if HEAD in attrs:
 								            col = attrs.index(HEAD)
 								            for i in range(length):
 								                # cast index to signed int
 								                abs_head_index = numpy.int32(array[i, col]) + i
 								                if abs_head_index < 0 or abs_head_index >= length:
 								                    raise ValueError(Errors.E190.format(index=i, value=array[i, col], rel_head_index=numpy.int32(array[i, col])))
-												💫 Fix interaction of lemmatizer and tokenizer exceptions (#3388)

Closes #2203. Closes #3268.

Lemmas set from outside the `Morphology` class were being overwritten. The result was especially confusing when deserialising, as it meant some lemmas could change when storing and retrieving a `Doc` object.

This PR applies two fixes:

1) When we go to set the lemma in the `Morphology` class, first check whether a lemma is already set. If so, don't overwrite.
2) When we load with `doc.from_array()`, take care to apply the `TAG` field first. This allows other fields to overwrite the `TAG` implied properties, if they're provided explicitly (e.g. the `LEMMA`).

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-11 03:31:21 +03:00
+								        # Do TAG first. This lets subsequent loop override stuff like POS, LEMMA
 								        if TAG in attrs:
 								            col = attrs.index(TAG)
 								            for i in range(length):
 								                if array[i, col] != 0:
 								                    self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
-												Fix serializer

											
										
										
											2017-05-09 19:45:18 +03:00
+								        # Now load the data
-												Fix dependency copy for as_doc (#3969)

* failing unit test for issue 3962

* attempt to fix Issue #3962

* create artificial unit test example

* using length instead of self.length

* sp

* reformat with black

* find better ancestor within span and use generic 'dep'

* attach to span.root if there is no appropriate ancestor

* comment span text

* clean up ancestor code

* reconstruct dep tree to keep same number of sentences

											
										
										
											2019-07-23 19:28:55 +03:00
+								        for i in range(length):
-												Fix serializer

											
										
										
											2017-05-09 19:45:18 +03:00
+								            token = &self.c[i]
 								            for j in range(n_attrs):
-												💫 Fix interaction of lemmatizer and tokenizer exceptions (#3388)

Closes #2203. Closes #3268.

Lemmas set from outside the `Morphology` class were being overwritten. The result was especially confusing when deserialising, as it meant some lemmas could change when storing and retrieving a `Doc` object.

This PR applies two fixes:

1) When we go to set the lemma in the `Morphology` class, first check whether a lemma is already set. If so, don't overwrite.
2) When we load with `doc.from_array()`, take care to apply the `TAG` field first. This allows other fields to overwrite the `TAG` implied properties, if they're provided explicitly (e.g. the `LEMMA`).

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-11 03:31:21 +03:00
+								                if attr_ids[j] != TAG:
 								                    Token.set_struct_attr(token, attr_ids[j], array[i, j])
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        # Set flags
-												Require HEAD for is_parsed in Doc.from_array() (#5011)

Modify flag settings so that `DEP` is not sufficient to set `is_parsed`
and only run `set_children_from_heads()` if `HEAD` is provided.

Then the combination `[SENT_START, DEP]` will set deps and not clobber
sent starts with a lot of one-word sentences.
											
										
										
											2020-02-16 19:17:09 +03:00
+								        self.is_parsed = bool(self.is_parsed or HEAD in attrs)
-												Fix clobber of doc.is_tagged in doc.from_array()

If doc.from_array() was called with say, only entity information, this
would cause doc.is_tagged to be set to False, even if tags were set.
This caused tags to be dropped from serialisation. The same was true for
doc.is_parsed.

Closes #3012.

											
										
										
											2018-12-30 17:48:10 +03:00
+								        self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        # If document is parsed, set children
-												fix sent_start in serialization

											
										
										
											2018-01-28 21:50:42 +03:00
+								        if self.is_parsed:
-												Fix dependency copy for as_doc (#3969)

* failing unit test for issue 3962

* attempt to fix Issue #3962

* create artificial unit test example

* using length instead of self.length

* sp

* reformat with black

* find better ancestor within span and use generic 'dep'

* attach to span.root if there is no appropriate ancestor

* comment span text

* clean up ancestor code

* reconstruct dep tree to keep same number of sentences

											
										
										
											2019-07-23 19:28:55 +03:00
+								            set_children_from_heads(self.c, length)
-												* Reorganize the serialization functions on Doc

											
										
										
											2015-07-22 05:53:01 +03:00
+								        return self
-												Add LCA matrix for spans and docs

											
										
										
											2017-10-20 21:28:00 +03:00
+								    def get_lca_matrix(self):
-												Fix issue 2396 (#3089)

* Test on #2396: bug in Doc.get_lca_matrix()

* reimplementation of Doc.get_lca_matrix(), (closes #2396)

* reimplement Span.get_lca_matrix(), and call it from Doc.get_lca_matrix()

* tests Span.get_lca_matrix() as well as Doc.get_lca_matrix()

* implement _get_lca_matrix as a helper function in doc.pyx; call it from Doc.get_lca_matrix and Span.get_lca_matrix

* use memory view instead of np.ndarray in _get_lca_matrix (faster)

* fix bug when calling Span.get_lca_matrix; return lca matrix as np.array instead of memoryview

* cleaner conditional, add comment

											
										
										
											2018-12-29 20:02:26 +03:00
+								        """Calculates a matrix of Lowest Common Ancestors (LCA) for a given
 								        `Doc`, where LCA[i, j] is the index of the lowest common ancestor among
 								        token i and j.
 								        RETURNS (np.array[ndim=2, dtype=numpy.int32]): LCA matrix with shape
 								            (n, n), where n = len(self).
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								        DOCS: https://spacy.io/api/doc#get_lca_matrix
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """
-												Fix issue 2396 (#3089)

* Test on #2396: bug in Doc.get_lca_matrix()

* reimplementation of Doc.get_lca_matrix(), (closes #2396)

* reimplement Span.get_lca_matrix(), and call it from Doc.get_lca_matrix()

* tests Span.get_lca_matrix() as well as Doc.get_lca_matrix()

* implement _get_lca_matrix as a helper function in doc.pyx; call it from Doc.get_lca_matrix and Span.get_lca_matrix

* use memory view instead of np.ndarray in _get_lca_matrix (faster)

* fix bug when calling Span.get_lca_matrix; return lca matrix as np.array instead of memoryview

* cleaner conditional, add comment

											
										
										
											2018-12-29 20:02:26 +03:00
+								        return numpy.asarray(_get_lca_matrix(self, 0, len(self)))
-												Add LCA matrix for spans and docs

											
										
										
											2017-10-20 21:28:00 +03:00
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								    def to_disk(self, path, **kwargs):
-												Add Doc.to_disk() and Doc.from_disk() methods

											
										
										
											2017-05-24 12:58:17 +03:00
+								        """Save the current state to a directory.
 								        path (unicode or Path): A path to a directory, which will be created if
-												Tidy up Doc, Token and Span and add missing docs

											
										
										
											2017-10-27 18:07:26 +03:00
+								            it doesn't exist. Paths may be either strings or Path-like objects.
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        exclude (list): String names of serialization fields to exclude.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								        DOCS: https://spacy.io/api/doc#to_disk
-												Add Doc.to_disk() and Doc.from_disk() methods

											
										
										
											2017-05-24 12:58:17 +03:00
+								        """
-												Ensure path in Doc.to_disk/from_disk (resolves ##1521)

Also add Doc serialization tests with both Path and string path options

											
										
										
											2017-11-09 04:29:03 +03:00
+								        path = util.ensure_path(path)
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        with path.open("wb") as file_:
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								            file_.write(self.to_bytes(**kwargs))
-												Add Doc.to_disk() and Doc.from_disk() methods

											
										
										
											2017-05-24 12:58:17 +03:00
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								    def from_disk(self, path, **kwargs):
-												Add Doc.to_disk() and Doc.from_disk() methods

											
										
										
											2017-05-24 12:58:17 +03:00
+								        """Loads state from a directory. Modifies the object in place and
 								        returns it.
 								        path (unicode or Path): A path to a directory. Paths may be either
 								            strings or `Path`-like objects.
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        exclude (list): String names of serialization fields to exclude.
-												Add Doc.to_disk() and Doc.from_disk() methods

											
										
										
											2017-05-24 12:58:17 +03:00
+								        RETURNS (Doc): The modified `Doc` object.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								        DOCS: https://spacy.io/api/doc#from_disk
-												Add Doc.to_disk() and Doc.from_disk() methods

											
										
										
											2017-05-24 12:58:17 +03:00
+								        """
-												Ensure path in Doc.to_disk/from_disk (resolves ##1521)

Also add Doc serialization tests with both Path and string path options

											
										
										
											2017-11-09 04:29:03 +03:00
+								        path = util.ensure_path(path)
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        with path.open("rb") as file_:
-												Try using tensor for vector/similarity methdos

											
										
										
											2017-05-31 00:35:17 +03:00
+								            bytes_data = file_.read()
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        return self.from_bytes(bytes_data, **kwargs)
-												Add Doc.to_disk() and Doc.from_disk() methods

											
										
										
											2017-05-24 12:58:17 +03:00
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								    def to_bytes(self, exclude=tuple(), **kwargs):
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        """Serialize, i.e. export the document contents to a binary string.
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        exclude (list): String names of serialization fields to exclude.
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
 								            all annotations.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								        DOCS: https://spacy.io/api/doc#to_bytes
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Add strings and ENT_KB_ID to Doc serialization (#5691)

* Add strings for all writeable Token attributes to `Doc.to/from_bytes()`.
* Add ENT_KB_ID to default attributes.
											
										
										
											2020-07-02 18:11:57 +03:00
+								        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM, ENT_KB_ID]
-												fix sent_start in serialization

											
										
										
											2018-01-28 21:50:42 +03:00
+								        if self.is_tagged:
-												Serialize POS attribute when doc.is_tagged (#4092)

* fix and unit test for issue 3959

* additional unit test for manifestation of the same (resolved) bug

											
										
										
											2019-08-21 22:59:30 +03:00
+								            array_head.extend([TAG, POS])
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        # If doc parsed add head and dep attribute
-												fix sent_start in serialization

											
										
										
											2018-01-28 21:50:42 +03:00
+								        if self.is_parsed:
 								            array_head.extend([HEAD, DEP])
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        # Otherwise add sent_start
-												fix sent_start in serialization

											
										
										
											2018-01-28 21:50:42 +03:00
+								        else:
 								            array_head.append(SENT_START)
-												Add strings and ENT_KB_ID to Doc serialization (#5691)

* Add strings for all writeable Token attributes to `Doc.to/from_bytes()`.
* Add ENT_KB_ID to default attributes.
											
										
										
											2020-07-02 18:11:57 +03:00
+								        strings = set()
 								        for token in self:
 								            strings.add(token.tag_)
 								            strings.add(token.lemma_)
 								            strings.add(token.dep_)
 								            strings.add(token.ent_type_)
 								            strings.add(token.ent_kb_id_)
-												Add ent_id_ to strings serialized with Doc (#6353)


											
										
										
											2020-11-10 15:16:07 +03:00
+								            strings.add(token.ent_id_)
-												Add strings and ENT_KB_ID to Doc serialization (#5691)

* Add strings for all writeable Token attributes to `Doc.to/from_bytes()`.
* Add ENT_KB_ID to default attributes.
											
										
										
											2020-07-02 18:11:57 +03:00
+								            strings.add(token.norm_)
-												Improve deserialization of user_data, esp. for Underscore

											
										
										
											2017-10-17 20:29:20 +03:00
+								        # Msgpack doesn't distinguish between lists and tuples, which is
 								        # vexing for user data. As a best guess, we *know* that within
 								        # keys, we must have tuples. In values we just have to hope
 								        # users don't mind getting a list instead of a tuple.
-												Try using tensor for vector/similarity methdos

											
										
										
											2017-05-31 00:35:17 +03:00
+								        serializers = {
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            "text": lambda: self.text,
 								            "array_head": lambda: array_head,
 								            "array_body": lambda: self.to_array(array_head),
 								            "sentiment": lambda: self.sentiment,
 								            "tensor": lambda: self.tensor,
-												Include Doc.cats in serialization of Doc and DocBin (#4774)

* Include Doc.cats in to_bytes()

* Include Doc.cats in DocBin serialization

* Add tests for serialization of cats

Test serialization of cats for Doc and DocBin.

											
										
										
											2019-12-06 16:07:39 +03:00
+								            "cats": lambda: self.cats,
-												Add strings and ENT_KB_ID to Doc serialization (#5691)

* Add strings for all writeable Token attributes to `Doc.to/from_bytes()`.
* Add ENT_KB_ID to default attributes.
											
										
										
											2020-07-02 18:11:57 +03:00
+								            "strings": lambda: list(strings),
-												Try using tensor for vector/similarity methdos

											
										
										
											2017-05-31 00:35:17 +03:00
+								        }
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        for key in kwargs:
 								            if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
 								                raise ValueError(Errors.E128.format(arg=key))
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        if "user_data" not in exclude and self.user_data:
-												Improve deserialization of user_data, esp. for Underscore

											
										
										
											2017-10-17 20:29:20 +03:00
+								            user_data_keys, user_data_values = list(zip(*self.user_data.items()))
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								            if "user_data_keys" not in exclude:
 								                serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys)
 								            if "user_data_values" not in exclude:
 								                serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
-												Try using tensor for vector/similarity methdos

											
										
										
											2017-05-31 00:35:17 +03:00
+								        return util.to_bytes(serializers, exclude)
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        """Deserialize, i.e. import the document contents from a binary string.
 								        data (bytes): The string to load from.
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        exclude (list): String names of serialization fields to exclude.
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        RETURNS (Doc): Itself.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								        DOCS: https://spacy.io/api/doc#from_bytes
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Implement Doc.to_bytes and Doc.from_bytes methods

											
										
										
											2017-05-09 19:11:34 +03:00
+								        if self.length != 0:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								            raise ValueError(Errors.E033.format(length=self.length))
-												Try using tensor for vector/similarity methdos

											
										
										
											2017-05-31 00:35:17 +03:00
+								        deserializers = {
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            "text": lambda b: None,
 								            "array_head": lambda b: None,
 								            "array_body": lambda b: None,
 								            "sentiment": lambda b: None,
 								            "tensor": lambda b: None,
-												Include Doc.cats in serialization of Doc and DocBin (#4774)

* Include Doc.cats in to_bytes()

* Include Doc.cats in DocBin serialization

* Add tests for serialization of cats

Test serialization of cats for Doc and DocBin.

											
										
										
											2019-12-06 16:07:39 +03:00
+								            "cats": lambda b: None,
-												Add strings and ENT_KB_ID to Doc serialization (#5691)

* Add strings for all writeable Token attributes to `Doc.to/from_bytes()`.
* Add ENT_KB_ID to default attributes.
											
										
										
											2020-07-02 18:11:57 +03:00
+								            "strings": lambda b: None,
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            "user_data_keys": lambda b: None,
 								            "user_data_values": lambda b: None,
-												Try using tensor for vector/similarity methdos

											
										
										
											2017-05-31 00:35:17 +03:00
+								        }
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        for key in kwargs:
 								            if key in deserializers or key in ("user_data",):
 								                raise ValueError(Errors.E128.format(arg=key))
-												Try using tensor for vector/similarity methdos

											
										
										
											2017-05-31 00:35:17 +03:00
+								        msg = util.from_bytes(bytes_data, deserializers, exclude)
-												Improve deserialization of user_data, esp. for Underscore

											
										
										
											2017-10-17 20:29:20 +03:00
+								        # Msgpack doesn't distinguish between lists and tuples, which is
 								        # vexing for user data. As a best guess, we *know* that within
 								        # keys, we must have tuples. In values we just have to hope
 								        # users don't mind getting a list instead of a tuple.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        if "user_data" not in exclude and "user_data_keys" in msg:
 								            user_data_keys = srsly.msgpack_loads(msg["user_data_keys"], use_list=False)
 								            user_data_values = srsly.msgpack_loads(msg["user_data_values"])
-												Improve deserialization of user_data, esp. for Underscore

											
										
										
											2017-10-17 20:29:20 +03:00
+								            for key, value in zip(user_data_keys, user_data_values):
 								                self.user_data[key] = value
-												Implement Doc.to_bytes and Doc.from_bytes methods

											
										
										
											2017-05-09 19:11:34 +03:00
+								        cdef int i, start, end, has_space
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        if "sentiment" not in exclude and "sentiment" in msg:
 								            self.sentiment = msg["sentiment"]
 								        if "tensor" not in exclude and "tensor" in msg:
 								            self.tensor = msg["tensor"]
-												Include Doc.cats in serialization of Doc and DocBin (#4774)

* Include Doc.cats in to_bytes()

* Include Doc.cats in DocBin serialization

* Add tests for serialization of cats

Test serialization of cats for Doc and DocBin.

											
										
										
											2019-12-06 16:07:39 +03:00
+								        if "cats" not in exclude and "cats" in msg:
 								            self.cats = msg["cats"]
-												Add strings and ENT_KB_ID to Doc serialization (#5691)

* Add strings for all writeable Token attributes to `Doc.to/from_bytes()`.
* Add ENT_KB_ID to default attributes.
											
										
										
											2020-07-02 18:11:57 +03:00
+								        if "strings" not in exclude and "strings" in msg:
 								            for s in msg["strings"]:
 								                self.vocab.strings.add(s)
-												Implement Doc.to_bytes and Doc.from_bytes methods

											
										
										
											2017-05-09 19:11:34 +03:00
+								        start = 0
 								        cdef const LexemeC* lex
 								        cdef unicode orth_
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        text = msg["text"]
 								        attrs = msg["array_body"]
-												Implement Doc.to_bytes and Doc.from_bytes methods

											
										
										
											2017-05-09 19:11:34 +03:00
+								        for i in range(attrs.shape[0]):
 								            end = start + attrs[i, 0]
 								            has_space = attrs[i, 1]
 								            orth_ = text[start:end]
 								            lex = self.vocab.get(self.mem, orth_)
 								            self.push_back(lex, has_space)
 								            start = end + has_space
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        self.from_array(msg["array_head"][2:], attrs[:, 2:])
-												Fix serializer

											
										
										
											2017-05-09 19:45:18 +03:00
+								        return self
-												* Reorganize the serialization functions on Doc

											
										
										
											2015-07-22 05:53:01 +03:00
-												Add Doc.extend_tensor() method

											
										
										
											2017-11-03 13:20:31 +03:00
+								    def extend_tensor(self, tensor):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """Concatenate a new tensor onto the doc.tensor object.
-												Add Doc.extend_tensor() method

											
										
										
											2017-11-03 13:20:31 +03:00
 								        The doc.tensor attribute holds dense feature vectors
 								        computed by the models in the pipeline. Let's say a
 								        document with 30 words has a tensor with 128 dimensions
 								        per word. doc.tensor.shape will be (30, 128). After
-												Fixed typos for #2222,#2223 (#2233) (closes #2222, closes #2223)


											
										
										
											2018-04-19 00:55:26 +03:00
+								        calling doc.extend_tensor with an array of shape (30, 64),
-												Add Doc.extend_tensor() method

											
										
										
											2017-11-03 13:20:31 +03:00
+								        doc.tensor == (30, 192).
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """
-												Add Doc.extend_tensor() method

											
										
										
											2017-11-03 13:20:31 +03:00
+								        xp = get_array_module(self.tensor)
 								        if self.tensor.size == 0:
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								            self.tensor.resize(tensor.shape, refcheck=False)
-												Add Doc.extend_tensor() method

											
										
										
											2017-11-03 13:20:31 +03:00
+								            copy_array(self.tensor, tensor)
 								        else:
 								            self.tensor = xp.hstack((self.tensor, tensor))
-												Add doc.retokenize() context manager (#2172)

This patch takes a step towards #1487 by introducing the
doc.retokenize() context manager, to handle merging spans, and soon
splitting tokens.

The idea is to do merging and splitting like this:

with doc.retokenize() as retokenizer:
    for start, end, label in matches:
        retokenizer.merge(doc[start : end], attrs={'ent_type': label})

The retokenizer accumulates the merge requests, and applies them
together at the end of the block. This will allow retokenization to be
more efficient, and much less error prone.

A retokenizer.split() function will then be added, to handle splitting a
single token into multiple tokens. These methods take `Span` and `Token`
objects; if the user wants to go directly from offsets, they can append
to the .merges and .splits lists on the retokenizer.

The doc.merge() method's behaviour remains unchanged, so this patch
should be 100% backwards incompatible (modulo bugs). Internally,
doc.merge() fixes up the arguments (to handle the various deprecated styles),
opens the retokenizer, and makes the single merge.

We can later start making deprecation warnings on direct calls to doc.merge(),
to migrate people to use of the retokenize context manager.
											
										
										
											2018-04-03 15:10:35 +03:00
+								    def retokenize(self):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """Context manager to handle retokenization of the Doc.
-												Add doc.retokenize() context manager (#2172)

This patch takes a step towards #1487 by introducing the
doc.retokenize() context manager, to handle merging spans, and soon
splitting tokens.

The idea is to do merging and splitting like this:

with doc.retokenize() as retokenizer:
    for start, end, label in matches:
        retokenizer.merge(doc[start : end], attrs={'ent_type': label})

The retokenizer accumulates the merge requests, and applies them
together at the end of the block. This will allow retokenization to be
more efficient, and much less error prone.

A retokenizer.split() function will then be added, to handle splitting a
single token into multiple tokens. These methods take `Span` and `Token`
objects; if the user wants to go directly from offsets, they can append
to the .merges and .splits lists on the retokenizer.

The doc.merge() method's behaviour remains unchanged, so this patch
should be 100% backwards incompatible (modulo bugs). Internally,
doc.merge() fixes up the arguments (to handle the various deprecated styles),
opens the retokenizer, and makes the single merge.

We can later start making deprecation warnings on direct calls to doc.merge(),
to migrate people to use of the retokenize context manager.
											
										
										
											2018-04-03 15:10:35 +03:00
+								        Modifications to the Doc's tokenization are stored, and then
 								        made all at once when the context manager exits. This is
 								        much more efficient, and less error-prone.
 								        All views of the Doc (Span and Token) created before the
 								        retokenization are invalidated, although they may accidentally
 								        continue to work.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								        DOCS: https://spacy.io/api/doc#retokenize
 								        USAGE: https://spacy.io/usage/linguistic-features#retokenization
 								        """
-												Add doc.retokenize() context manager (#2172)

This patch takes a step towards #1487 by introducing the
doc.retokenize() context manager, to handle merging spans, and soon
splitting tokens.

The idea is to do merging and splitting like this:

with doc.retokenize() as retokenizer:
    for start, end, label in matches:
        retokenizer.merge(doc[start : end], attrs={'ent_type': label})

The retokenizer accumulates the merge requests, and applies them
together at the end of the block. This will allow retokenization to be
more efficient, and much less error prone.

A retokenizer.split() function will then be added, to handle splitting a
single token into multiple tokens. These methods take `Span` and `Token`
objects; if the user wants to go directly from offsets, they can append
to the .merges and .splits lists on the retokenizer.

The doc.merge() method's behaviour remains unchanged, so this patch
should be 100% backwards incompatible (modulo bugs). Internally,
doc.merge() fixes up the arguments (to handle the various deprecated styles),
opens the retokenizer, and makes the single merge.

We can later start making deprecation warnings on direct calls to doc.merge(),
to migrate people to use of the retokenize context manager.
											
										
										
											2018-04-03 15:10:35 +03:00
+								        return Retokenizer(self)
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								    def _bulk_merge(self, spans, attributes):
 								        """Retokenize the document, such that the spans given as arguments
 								         are merged into single tokens. The spans need to be in document
 								         order, and no span intersection is allowed.
 								        spans (Span[]): Spans to merge, in document order, with all span
-												Fixes typos (#4843)

* Fixes typos

* Fixes typo

* Contributor agreement

											
										
										
											2019-12-29 16:24:13 +03:00
+								            intersections empty. Cannot be empty.
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								        attributes (Dictionary[]): Attributes to assign to the merged tokens. By default,
-												Fixes typos (#4843)

* Fixes typos

* Fixes typo

* Contributor agreement

											
										
										
											2019-12-29 16:24:13 +03:00
+								            must be the same length as spans, empty dictionaries are allowed.
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								            attributes are inherited from the syntactic root of the span.
 								        RETURNS (Token): The first newly merged token.
 								        """
 								        cdef unicode tag, lemma, ent_type
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        attr_len = len(attributes)
 								        span_len = len(spans)
 								        if not attr_len == span_len:
 								            raise ValueError(Errors.E121.format(attr_len=attr_len, span_len=span_len))
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								        with self.retokenize() as retokenizer:
 								            for i, span in enumerate(spans):
 								                fix_attributes(self, attributes[i])
 								                remove_label_if_necessary(attributes[i])
 								                retokenizer.merge(span, attributes[i])
-												Improve API for doc.merge() and span.merge(), to use keyword arguments.

											
										
										
											2016-10-17 15:02:13 +03:00
+								    def merge(self, int start_idx, int end_idx, *args, **attributes):
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        """Retokenize the document, such that the span at
 								        `doc.text[start_idx : end_idx]` is merged into a single token. If
 								        `start_idx` and `end_idx `do not mark start and end token boundaries,
 								        the document remains unchanged.
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								        start_idx (int): Character index of the start of the slice to merge.
 								        end_idx (int): Character index after the end of the slice to merge.
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        **attributes: Attributes to assign to the merged token. By default,
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
+								            attributes are inherited from the syntactic root of the span.
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        RETURNS (Token): The newly merged token, or `None` if the start and end
 								            indices did not fall at token boundaries.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
+								        """
-												Improve API for doc.merge() and span.merge(), to use keyword arguments.

											
										
										
											2016-10-17 15:02:13 +03:00
+								        cdef unicode tag, lemma, ent_type
-												Simplify warnings

											
										
										
											2020-04-28 14:37:37 +03:00
+								        warnings.warn(Warnings.W013.format(obj="Doc"), DeprecationWarning)
-												ensure Span.as_doc keeps the entity links + unit test

											
										
										
											2019-06-25 16:28:51 +03:00
+								        # TODO: ENT_KB_ID ?
-												Improve API for doc.merge() and span.merge(), to use keyword arguments.

											
										
										
											2016-10-17 15:02:13 +03:00
+								        if len(args) == 3:
-												Simplify warnings

											
										
										
											2020-04-28 14:37:37 +03:00
+								            warnings.warn(Warnings.W003, DeprecationWarning)
-												Improve API for doc.merge() and span.merge(), to use keyword arguments.

											
										
										
											2016-10-17 15:02:13 +03:00
+								            tag, lemma, ent_type = args
-												WIP on stringstore change. 27 failures

											
										
										
											2017-05-28 15:06:40 +03:00
+								            attributes[TAG] = tag
 								            attributes[LEMMA] = lemma
 								            attributes[ENT_TYPE] = ent_type
-												Add option to use label karg to determine ent_type in doc.merge

											
										
										
											2017-03-29 09:35:03 +03:00
+								        elif not args:
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								            fix_attributes(self, attributes)
-												Improve API for doc.merge() and span.merge(), to use keyword arguments.

											
										
										
											2016-10-17 15:02:13 +03:00
+								        elif args:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            raise ValueError(Errors.E034.format(n_args=len(args), args=repr(args),
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                                                kwargs=repr(attributes)))
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								        remove_label_if_necessary(attributes)
-												WIP on stringstore change. 27 failures

											
										
										
											2017-05-28 15:06:40 +03:00
+								        attributes = intify_attrs(attributes, strings_map=self.vocab.strings)
-												* Rework the Span-merge patch, to avoid extending the interface of Doc, and avoid virtualizing the Span.start and Span.end indices, to keep Span usage efficient

											
										
										
											2015-11-07 00:55:34 +03:00
+								        cdef int start = token_by_start(self.c, self.length, start_idx)
 								        if start == -1:
-												* merge add lex last - add index finder funcs

											
										
										
											2015-11-05 18:28:08 +03:00
+								            return None
-												* Rework the Span-merge patch, to avoid extending the interface of Doc, and avoid virtualizing the Span.start and Span.end indices, to keep Span usage efficient

											
										
										
											2015-11-07 00:55:34 +03:00
+								        cdef int end = token_by_end(self.c, self.length, end_idx)
 								        if end == -1:
 								            return None
 								        # Currently we have the token index, we want the range-end index
 								        end += 1
-												Add doc.retokenize() context manager (#2172)

This patch takes a step towards #1487 by introducing the
doc.retokenize() context manager, to handle merging spans, and soon
splitting tokens.

The idea is to do merging and splitting like this:

with doc.retokenize() as retokenizer:
    for start, end, label in matches:
        retokenizer.merge(doc[start : end], attrs={'ent_type': label})

The retokenizer accumulates the merge requests, and applies them
together at the end of the block. This will allow retokenization to be
more efficient, and much less error prone.

A retokenizer.split() function will then be added, to handle splitting a
single token into multiple tokens. These methods take `Span` and `Token`
objects; if the user wants to go directly from offsets, they can append
to the .merges and .splits lists on the retokenizer.

The doc.merge() method's behaviour remains unchanged, so this patch
should be 100% backwards incompatible (modulo bugs). Internally,
doc.merge() fixes up the arguments (to handle the various deprecated styles),
opens the retokenizer, and makes the single merge.

We can later start making deprecation warnings on direct calls to doc.merge(),
to migrate people to use of the retokenize context manager.
											
										
										
											2018-04-03 15:10:35 +03:00
+								        with self.retokenize() as retokenizer:
 								            retokenizer.merge(self[start:end], attrs=attributes)
-												* Add spacy/tokens/doc.pyx, for Doc class in its own file

											
										
										
											2015-07-13 20:58:26 +03:00
+								        return self[start]
-												* Add noun_chunks iterator, and fix left/right child setting in Doc.merge

											
										
										
											2015-07-30 03:29:49 +03:00
-												move parse_tree logic to a new tokens/printers.py file

											
										
										
											2016-12-30 20:19:18 +03:00
+								    def print_tree(self, light=False, flat=False):
-												💫 New JSON helpers, training data internals & CLI rewrite (#2932)

* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space.

* Remove outdated test

* Add write_json and write_jsonl helpers

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO)

### Types of change
enhancement

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command

											
										
										
											2018-11-30 22:16:14 +03:00
+								        raise ValueError(Errors.E105)
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
-												💫 New JSON helpers, training data internals & CLI rewrite (#2932)

* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space.

* Remove outdated test

* Add write_json and write_jsonl helpers

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO)

### Types of change
enhancement

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command

											
										
										
											2018-11-30 22:16:14 +03:00
+								    def to_json(self, underscore=None):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """Convert a Doc to JSON. The format it produces will be the new format
 								        for the `spacy train` command (not implemented yet).
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
-												💫 New JSON helpers, training data internals & CLI rewrite (#2932)

* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space.

* Remove outdated test

* Add write_json and write_jsonl helpers

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO)

### Types of change
enhancement

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command

											
										
										
											2018-11-30 22:16:14 +03:00
+								        underscore (list): Optional list of string names of custom doc._.
 								        attributes. Attribute values need to be JSON-serializable. Values will
 								        be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
 								        RETURNS (dict): The data in spaCy's JSON format.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								        DOCS: https://spacy.io/api/doc#to_json
-												Update docstrings and API docs for Doc class

											
										
										
											2017-05-18 23:17:09 +03:00
+								        """
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        data = {"text": self.text}
-												💫 Improve Doc.to_json and add Doc.is_nered (#3381)

* Use default return instead of else

* Add Doc.is_nered to indicate if entities have been set

* Add properties in Doc.to_json if they were set, not if they're available

This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training.

											
										
										
											2019-03-10 17:24:34 +03:00
+								        if self.is_nered:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
 								                            "label": ent.label_} for ent in self.ents]
-												💫 Improve Doc.to_json and add Doc.is_nered (#3381)

* Use default return instead of else

* Add Doc.is_nered to indicate if entities have been set

* Add properties in Doc.to_json if they were set, not if they're available

This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training.

											
										
										
											2019-03-10 17:24:34 +03:00
+								        if self.is_sentenced:
 								            sents = list(self.sents)
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
-												💫 New JSON helpers, training data internals & CLI rewrite (#2932)

* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space.

* Remove outdated test

* Add write_json and write_jsonl helpers

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO)

### Types of change
enhancement

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command

											
										
										
											2018-11-30 22:16:14 +03:00
+								                             for sent in sents]
 								        if self.cats:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            data["cats"] = self.cats
 								        data["tokens"] = []
-												💫 New JSON helpers, training data internals & CLI rewrite (#2932)

* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space.

* Remove outdated test

* Add write_json and write_jsonl helpers

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO)

### Types of change
enhancement

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command

											
										
										
											2018-11-30 22:16:14 +03:00
+								        for token in self:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
-												💫 Improve Doc.to_json and add Doc.is_nered (#3381)

* Use default return instead of else

* Add Doc.is_nered to indicate if entities have been set

* Add properties in Doc.to_json if they were set, not if they're available

This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training.

											
										
										
											2019-03-10 17:24:34 +03:00
+								            if self.is_tagged:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								                token_data["pos"] = token.pos_
 								                token_data["tag"] = token.tag_
-												💫 Improve Doc.to_json and add Doc.is_nered (#3381)

* Use default return instead of else

* Add Doc.is_nered to indicate if entities have been set

* Add properties in Doc.to_json if they were set, not if they're available

This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training.

											
										
										
											2019-03-10 17:24:34 +03:00
+								            if self.is_parsed:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								                token_data["dep"] = token.dep_
 								                token_data["head"] = token.head.i
 								            data["tokens"].append(token_data)
-												💫 New JSON helpers, training data internals & CLI rewrite (#2932)

* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space.

* Remove outdated test

* Add write_json and write_jsonl helpers

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO)

### Types of change
enhancement

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command

											
										
										
											2018-11-30 22:16:14 +03:00
+								        if underscore:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            data["_"] = {}
-												💫 New JSON helpers, training data internals & CLI rewrite (#2932)

* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space.

* Remove outdated test

* Add write_json and write_jsonl helpers

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO)

### Types of change
enhancement

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command

											
										
										
											2018-11-30 22:16:14 +03:00
+								            for attr in underscore:
 								                if not self.has_extension(attr):
 								                    raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
 								                value = self._.get(attr)
-												💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)

Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉

See here: https://github.com/explosion/srsly

    Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.

    At the same time, we noticed that having a lot of small dependencies was making maintainence harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.

    srsly currently includes forks of the following packages:

        ujson
        msgpack
        msgpack-numpy
        cloudpickle



* WIP: replace json/ujson with srsly

* Replace ujson in examples

Use regular json instead of srsly to make code easier to read and follow

* Update requirements

* Fix imports

* Fix typos

* Replace msgpack with srsly

* Fix warning

											
										
										
											2018-12-03 03:28:22 +03:00
+								                if not srsly.is_json_serializable(value):
-												💫 New JSON helpers, training data internals & CLI rewrite (#2932)

* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space.

* Remove outdated test

* Add write_json and write_jsonl helpers

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO)

### Types of change
enhancement

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command

											
										
										
											2018-11-30 22:16:14 +03:00
+								                    raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								                data["_"][attr] = value
-												💫 New JSON helpers, training data internals & CLI rewrite (#2932)

* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space.

* Remove outdated test

* Add write_json and write_jsonl helpers

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO)

### Types of change
enhancement

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command

											
										
										
											2018-11-30 22:16:14 +03:00
+								        return data
-												move parse_tree logic to a new tokens/printers.py file

											
										
										
											2016-12-30 20:19:18 +03:00
-												Add method to export utf8 array to Doc

											
										
										
											2019-03-09 14:50:27 +03:00
+								    def to_utf8_array(self, int nr_char=-1):
 								        """Encode word strings to utf8, and export to a fixed-width array
 								        of characters. Characters are placed into the array in the order:
 , -1, 1, -2, etc
 								        For example, if the array is sliced array[:, :8], the array will
 								        contain the first 4 characters and last 4 characters of each word ---
 								        with the middle characters clipped out. The value 255 is used as a pad
 								        value.
 								        """
 								        byte_strings = [token.orth_.encode('utf8') for token in self]
 								        if nr_char == -1:
 								            nr_char = max(len(bs) for bs in byte_strings)
 								        cdef np.ndarray output = numpy.zeros((len(byte_strings), nr_char), dtype='uint8')
 								        output.fill(255)
 								        cdef int i, j, start_idx, end_idx
 								        cdef bytes byte_string
 								        cdef unsigned char utf8_char
 								        for i, byte_string in enumerate(byte_strings):
 								            j = 0
 								            start_idx = 0
 								            end_idx = len(byte_string) - 1
 								            while j < nr_char and start_idx <= end_idx:
 								                output[i, j] = <unsigned char>byte_string[start_idx]
 								                start_idx += 1
 								                j += 1
 								                if j < nr_char and start_idx <= end_idx:
 								                    output[i, j] = <unsigned char>byte_string[end_idx]
 								                    end_idx -= 1
 								                    j += 1
 								        return output
-												* Add noun_chunks iterator, and fix left/right child setting in Doc.merge

											
										
										
											2015-07-30 03:29:49 +03:00
-												* Rework the Span-merge patch, to avoid extending the interface of Doc, and avoid virtualizing the Span.start and Span.end indices, to keep Span usage efficient

											
										
										
											2015-11-07 00:55:34 +03:00
+								cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
-												Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries

Add a `mode` option to allow `Doc.char_span` to snap to token
boundaries. The `mode` options:

* `strict`: character offsets must match token boundaries (default, same as
before)
* `inside`: all tokens completely within the character span
* `outside`: all tokens at least partially covered by the character span

Add a new helper function `token_by_char` that returns the token
corresponding to a character position in the text. Update
`token_by_start` and `token_by_end` to use `token_by_char` for more
efficient searching.

* Remove unused import

* Rename mode to alignment_mode

Rename `mode` to `alignment_mode` with the options
`strict`/`contract`/`expand`. Any unrecognized modes are silently
converted to `strict`.
											
										
										
											2020-08-04 14:36:32 +03:00
+								    cdef int i = token_by_char(tokens, length, start_char)
 								    if i >= 0 and tokens[i].idx == start_char:
 								        return i
-												* Rework the Span-merge patch, to avoid extending the interface of Doc, and avoid virtualizing the Span.start and Span.end indices, to keep Span usage efficient

											
										
										
											2015-11-07 00:55:34 +03:00
+								    else:
 								        return -1
 								cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2:
-												Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries

Add a `mode` option to allow `Doc.char_span` to snap to token
boundaries. The `mode` options:

* `strict`: character offsets must match token boundaries (default, same as
before)
* `inside`: all tokens completely within the character span
* `outside`: all tokens at least partially covered by the character span

Add a new helper function `token_by_char` that returns the token
corresponding to a character position in the text. Update
`token_by_start` and `token_by_end` to use `token_by_char` for more
efficient searching.

* Remove unused import

* Rename mode to alignment_mode

Rename `mode` to `alignment_mode` with the options
`strict`/`contract`/`expand`. Any unrecognized modes are silently
converted to `strict`.
											
										
										
											2020-08-04 14:36:32 +03:00
+								    # end_char is exclusive, so find the token at one char before
 								    cdef int i = token_by_char(tokens, length, end_char - 1)
 								    if i >= 0 and tokens[i].idx + tokens[i].lex.length == end_char:
 								        return i
-												* Rework the Span-merge patch, to avoid extending the interface of Doc, and avoid virtualizing the Span.start and Span.end indices, to keep Span usage efficient

											
										
										
											2015-11-07 00:55:34 +03:00
+								    else:
 								        return -1
-												Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries

Add a `mode` option to allow `Doc.char_span` to snap to token
boundaries. The `mode` options:

* `strict`: character offsets must match token boundaries (default, same as
before)
* `inside`: all tokens completely within the character span
* `outside`: all tokens at least partially covered by the character span

Add a new helper function `token_by_char` that returns the token
corresponding to a character position in the text. Update
`token_by_start` and `token_by_end` to use `token_by_char` for more
efficient searching.

* Remove unused import

* Rename mode to alignment_mode

Rename `mode` to `alignment_mode` with the options
`strict`/`contract`/`expand`. Any unrecognized modes are silently
converted to `strict`.
											
										
										
											2020-08-04 14:36:32 +03:00
+								cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2:
 								    cdef int start = 0, mid, end = length - 1
 								    while start <= end:
 								        mid = (start + end) / 2
 								        if char_idx < tokens[mid].idx:
 								            end = mid - 1
 								        elif char_idx >= tokens[mid].idx + tokens[mid].lex.length + tokens[mid].spacy:
 								            start = mid + 1
 								        else:
 								            return mid
 								    return -1
-												* Add noun_chunks iterator, and fix left/right child setting in Doc.merge

											
										
										
											2015-07-30 03:29:49 +03:00
+								cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
 								    cdef TokenC* head
 								    cdef TokenC* child
 								    cdef int i
-												* Fix Issue #122: Incorrect calculation of children after Doc.merge()

											
										
										
											2015-10-18 09:17:27 +03:00
+								    # Set number of left/right children to 0. We'll increment it in the loops.
 								    for i in range(length):
 								        tokens[i].l_kids = 0
 								        tokens[i].r_kids = 0
 								        tokens[i].l_edge = i
 								        tokens[i].r_edge = i
-												Iterate over lr_edges until sents are correct (#4702)

Iterate over lr_edges until all heads are within the current sentence.
Instead of iterating over them for a fixed number of iterations, check
whether the sentence boundaries are correct for the heads and stop when
all are correct. Stop after a maximum of 10 iterations, providing a
warning in this case since the sentence boundaries may not be correct.
											
										
										
											2019-11-25 15:06:36 +03:00
+								    cdef int loop_count = 0
 								    cdef bint heads_within_sents = False
 								    # Try up to 10 iterations of adjusting lr_kids and lr_edges in order to
 								    # handle non-projective dependency parses, stopping when all heads are
 								    # within their respective sentence boundaries. We have documented cases
 								    # that need at least 4 iterations, so this is to be on the safe side
 								    # without risking getting stuck in an infinite loop if something is
 								    # terribly malformed.
 								    while not heads_within_sents:
 								        heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count)
 								        if loop_count > 10:
-												Simplify warnings

											
										
										
											2020-04-28 14:37:37 +03:00
+								            warnings.warn(Warnings.W026)
-												Break out of infinite loop as intended (#5077)


											
										
										
											2020-03-03 14:29:05 +03:00
+								            break
-												Iterate over lr_edges until sents are correct (#4702)

Iterate over lr_edges until all heads are within the current sentence.
Instead of iterating over them for a fixed number of iterations, check
whether the sentence boundaries are correct for the heads and stop when
all are correct. Stop after a maximum of 10 iterations, providing a
warning in this case since the sentence boundaries may not be correct.
											
										
										
											2019-11-25 15:06:36 +03:00
+								        loop_count += 1
-												* Fix tag handling in doc.merge, and assign sent_start when setting heads.

											
										
										
											2015-11-03 10:14:53 +03:00
+								    # Set sentence starts
 								    for i in range(length):
 								        if tokens[i].head == 0 and tokens[i].dep != 0:
 								            tokens[tokens[i].l_edge].sent_start = True
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												Fix Doc pickling. This also removes need for Binder class

											
										
										
											2017-10-17 17:11:13 +03:00
-												Iterate over lr_edges until sents are correct (#4702)

Iterate over lr_edges until all heads are within the current sentence.
Instead of iterating over them for a fixed number of iterations, check
whether the sentence boundaries are correct for the heads and stop when
all are correct. Stop after a maximum of 10 iterations, providing a
warning in this case since the sentence boundaries may not be correct.
											
										
										
											2019-11-25 15:06:36 +03:00
+								cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1:
 								    # May be called multiple times due to non-projectivity. See issues #3170
 								    # and #4688.
 								    # Set left edges
 								    cdef TokenC* head
 								    cdef TokenC* child
 								    cdef int i, j
 								    for i in range(length):
 								        child = &tokens[i]
 								        head = &tokens[i + child.head]
 								        if child < head and loop_count == 0:
 								            head.l_kids += 1
 								        if child.l_edge < head.l_edge:
 								            head.l_edge = child.l_edge
 								        if child.r_edge > head.r_edge:
 								            head.r_edge = child.r_edge
 								    # Set right edges - same as above, but iterate in reverse
 								    for i in range(length-1, -1, -1):
 								        child = &tokens[i]
 								        head = &tokens[i + child.head]
 								        if child > head and loop_count == 0:
 								            head.r_kids += 1
 								        if child.r_edge > head.r_edge:
 								            head.r_edge = child.r_edge
 								        if child.l_edge < head.l_edge:
 								            head.l_edge = child.l_edge
 								    # Get sentence start positions according to current state
 								    sent_starts = set()
 								    for i in range(length):
 								        if tokens[i].head == 0 and tokens[i].dep != 0:
 								            sent_starts.add(tokens[i].l_edge)
 								    cdef int curr_sent_start = 0
 								    cdef int curr_sent_end = 0
 								    # Check whether any heads are not within the current sentence
 								    for i in range(length):
 								        if (i > 0 and i in sent_starts) or i == length - 1:
 								            curr_sent_end = i
 								            for j in range(curr_sent_start, curr_sent_end):
 								                if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1:
 								                    return False
 								            curr_sent_start = i
 								    return True
-												Fix issue 2396 (#3089)

* Test on #2396: bug in Doc.get_lca_matrix()

* reimplementation of Doc.get_lca_matrix(), (closes #2396)

* reimplement Span.get_lca_matrix(), and call it from Doc.get_lca_matrix()

* tests Span.get_lca_matrix() as well as Doc.get_lca_matrix()

* implement _get_lca_matrix as a helper function in doc.pyx; call it from Doc.get_lca_matrix and Span.get_lca_matrix

* use memory view instead of np.ndarray in _get_lca_matrix (faster)

* fix bug when calling Span.get_lca_matrix; return lca matrix as np.array instead of memoryview

* cleaner conditional, add comment

											
										
										
											2018-12-29 20:02:26 +03:00
+								cdef int _get_tokens_lca(Token token_j, Token token_k):
 								    """Given two tokens, returns the index of the lowest common ancestor
 								    (LCA) among the two. If they have no common ancestor, -1 is returned.
 								    token_j (Token): a token.
 								    token_k (Token): another token.
 								    RETURNS (int): index of lowest common ancestor, or -1 if the tokens
 								        have no common ancestor.
 								    """
 								    if token_j == token_k:
 								        return token_j.i
 								    elif token_j.head == token_k:
 								        return token_k.i
 								    elif token_k.head == token_j:
 								        return token_j.i
 								    token_j_ancestors = set(token_j.ancestors)
 								    if token_k in token_j_ancestors:
 								        return token_k.i
 								    for token_k_ancestor in token_k.ancestors:
 								        if token_k_ancestor == token_j:
 								            return token_j.i
 								        if token_k_ancestor in token_j_ancestors:
 								            return token_k_ancestor.i
 								    return -1
 								cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
 								    """Given a doc and a start and end position defining a set of contiguous
 								    tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where
 								    LCA[i, j] is the index of the lowest common ancestor among token i and j.
 								    If the tokens have no common ancestor within the specified span,
 								    LCA[i, j] will be -1.
 								    doc (Doc): The index of the token, or the slice of the document
 								    start (int): First token to be included in the LCA matrix.
 								    end (int): Position of next to last token included in the LCA matrix.
 								    RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32],
 								        with shape (n, n), where n = len(doc).
 								    """
 								    cdef int [:,:] lca_matrix
 								    n_tokens= end - start
-												Bugfix/get lca matrix (#3110)

This PR adds a test for an untested case of `Span.get_lca_matrix`, and fixes a bug for that scenario, which I introduced in [this PR](https://github.com/explosion/spaCy/pull/3089) (sorry!).

## Description
The previous implementation of get_lca_matrix was failing for the case `doc[j:k].get_lca_matrix()` where `j > 0`. A test has been added for this case and the bug has been fixed.

### Types of change
Bug fix

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-01-06 21:07:50 +03:00
+								    lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
 								    lca_mat.fill(-1)
 								    lca_matrix = lca_mat
 								    for j in range(n_tokens):
 								        token_j = doc[start + j]
-												Fix issue 2396 (#3089)

* Test on #2396: bug in Doc.get_lca_matrix()

* reimplementation of Doc.get_lca_matrix(), (closes #2396)

* reimplement Span.get_lca_matrix(), and call it from Doc.get_lca_matrix()

* tests Span.get_lca_matrix() as well as Doc.get_lca_matrix()

* implement _get_lca_matrix as a helper function in doc.pyx; call it from Doc.get_lca_matrix and Span.get_lca_matrix

* use memory view instead of np.ndarray in _get_lca_matrix (faster)

* fix bug when calling Span.get_lca_matrix; return lca matrix as np.array instead of memoryview

* cleaner conditional, add comment

											
										
										
											2018-12-29 20:02:26 +03:00
+								        # the common ancestor of token and itself is itself:
 								        lca_matrix[j, j] = j
-												Bugfix/get lca matrix (#3110)

This PR adds a test for an untested case of `Span.get_lca_matrix`, and fixes a bug for that scenario, which I introduced in [this PR](https://github.com/explosion/spaCy/pull/3089) (sorry!).

## Description
The previous implementation of get_lca_matrix was failing for the case `doc[j:k].get_lca_matrix()` where `j > 0`. A test has been added for this case and the bug has been fixed.

### Types of change
Bug fix

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-01-06 21:07:50 +03:00
+								        # we will only iterate through tokens in the same sentence
 								        sent = token_j.sent
 								        sent_start = sent.start
 								        j_idx_in_sent = start + j - sent_start
 								        n_missing_tokens_in_sent = len(sent) - j_idx_in_sent
 								        # make sure we do not go past `end`, in cases where `end` < sent.end
-												Fix range in Span.get_lca_matrix (#8115)

Fix the adjusted token index / lca matrix index ranges for
`_get_lca_matrix` for spans.

* The range for `k` should correspond to the adjusted indices in
`lca_matrix` with the `start` indexed at `0`
											
										
										
											2021-05-17 17:54:10 +03:00
+								        max_range = min(j + n_missing_tokens_in_sent, end - start)
-												Bugfix/get lca matrix (#3110)

This PR adds a test for an untested case of `Span.get_lca_matrix`, and fixes a bug for that scenario, which I introduced in [this PR](https://github.com/explosion/spaCy/pull/3089) (sorry!).

## Description
The previous implementation of get_lca_matrix was failing for the case `doc[j:k].get_lca_matrix()` where `j > 0`. A test has been added for this case and the bug has been fixed.

### Types of change
Bug fix

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-01-06 21:07:50 +03:00
+								        for k in range(j + 1, max_range):
 								            lca = _get_tokens_lca(token_j, doc[start + k])
-												Fix issue 2396 (#3089)

* Test on #2396: bug in Doc.get_lca_matrix()

* reimplementation of Doc.get_lca_matrix(), (closes #2396)

* reimplement Span.get_lca_matrix(), and call it from Doc.get_lca_matrix()

* tests Span.get_lca_matrix() as well as Doc.get_lca_matrix()

* implement _get_lca_matrix as a helper function in doc.pyx; call it from Doc.get_lca_matrix and Span.get_lca_matrix

* use memory view instead of np.ndarray in _get_lca_matrix (faster)

* fix bug when calling Span.get_lca_matrix; return lca matrix as np.array instead of memoryview

* cleaner conditional, add comment

											
										
										
											2018-12-29 20:02:26 +03:00
+								            # if lca is outside of span, we set it to -1
 								            if not start <= lca < end:
 								                lca_matrix[j, k] = -1
 								                lca_matrix[k, j] = -1
 								            else:
-												Bugfix/get lca matrix (#3110)

This PR adds a test for an untested case of `Span.get_lca_matrix`, and fixes a bug for that scenario, which I introduced in [this PR](https://github.com/explosion/spaCy/pull/3089) (sorry!).

## Description
The previous implementation of get_lca_matrix was failing for the case `doc[j:k].get_lca_matrix()` where `j > 0`. A test has been added for this case and the bug has been fixed.

### Types of change
Bug fix

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-01-06 21:07:50 +03:00
+								                lca_matrix[j, k] = lca - start
 								                lca_matrix[k, j] = lca - start
-												Fix issue 2396 (#3089)

* Test on #2396: bug in Doc.get_lca_matrix()

* reimplementation of Doc.get_lca_matrix(), (closes #2396)

* reimplement Span.get_lca_matrix(), and call it from Doc.get_lca_matrix()

* tests Span.get_lca_matrix() as well as Doc.get_lca_matrix()

* implement _get_lca_matrix as a helper function in doc.pyx; call it from Doc.get_lca_matrix and Span.get_lca_matrix

* use memory view instead of np.ndarray in _get_lca_matrix (faster)

* fix bug when calling Span.get_lca_matrix; return lca matrix as np.array instead of memoryview

* cleaner conditional, add comment

											
										
										
											2018-12-29 20:02:26 +03:00
+								    return lca_matrix
-												Fix Doc pickling. This also removes need for Binder class

											
										
										
											2017-10-17 17:11:13 +03:00
+								def pickle_doc(doc):
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								    bytes_data = doc.to_bytes(exclude=["vocab", "user_data"])
-												Make doc pickling support hooks

											
										
										
											2017-10-17 20:44:09 +03:00
+								    hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,
 								                      doc.user_token_hooks)
-												Fix removabl of dill (for srsly)

											
										
										
											2018-12-06 20:46:09 +03:00
+								    return (unpickle_doc, (doc.vocab, srsly.pickle_dumps(hooks_and_data), bytes_data))
-												Fix Doc pickling. This also removes need for Binder class

											
										
										
											2017-10-17 17:11:13 +03:00
-												Make doc pickling support hooks

											
										
										
											2017-10-17 20:44:09 +03:00
+								def unpickle_doc(vocab, hooks_and_data, bytes_data):
-												Fix removabl of dill (for srsly)

											
										
										
											2018-12-06 20:46:09 +03:00
+								    user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data)
-												Tidy up and document Doc, Token and Span

											
										
										
											2017-10-27 16:41:45 +03:00
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								    doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude=["user_data"])
-												Make doc pickling support hooks

											
										
										
											2017-10-17 20:44:09 +03:00
+								    doc.user_hooks.update(doc_hooks)
 								    doc.user_span_hooks.update(span_hooks)
 								    doc.user_token_hooks.update(token_hooks)
-												Fix Doc pickling. This also removes need for Binder class

											
										
										
											2017-10-17 17:11:13 +03:00
+								    return doc
 								copy_reg.pickle(Doc, pickle_doc, unpickle_doc)
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								def remove_label_if_necessary(attributes):
 								    # More deprecated attribute handling =/
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								    if "label" in attributes:
 								        attributes["ent_type"] = attributes.pop("label")
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
 								def fix_attributes(doc, attributes):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								    if "label" in attributes and "ent_type" not in attributes:
 								        if isinstance(attributes["label"], int):
 								            attributes[ENT_TYPE] = attributes["label"]
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								        else:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            attributes[ENT_TYPE] = doc.vocab.strings[attributes["label"]]
 								    if "ent_type" in attributes:
 								        attributes[ENT_TYPE] = attributes["ent_type"]
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
 								def get_entity_info(ent_info):
 								    if isinstance(ent_info, Span):
 								        ent_type = ent_info.label
-												Ensure that doc.ents preserves kb_id annotations (#4294)

* bugfix: ensure doc.ents preserves kb_id annotations

* fix backward compatibility

* additional test

											
										
										
											2019-09-16 16:18:37 +03:00
+								        ent_kb_id = ent_info.kb_id
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								        start = ent_info.start
 								        end = ent_info.end
 								    elif len(ent_info) == 3:
 								        ent_type, start, end = ent_info
-												Ensure that doc.ents preserves kb_id annotations (#4294)

* bugfix: ensure doc.ents preserves kb_id annotations

* fix backward compatibility

* additional test

											
										
										
											2019-09-16 16:18:37 +03:00
+								        ent_kb_id = 0
 								    elif len(ent_info) == 4:
 								        ent_type, ent_kb_id, start, end = ent_info
-												💫 Port master changes over to develop (#2979)

* Create aryaprabhudesai.md (#2681)

* Update _install.jade (#2688)

Typo fix: "models" -> "model"

* Add FAC to spacy.explain (resolves #2706)

* Remove docstrings for deprecated arguments (see #2703)

* When calling getoption() in conftest.py, pass a default option (#2709)

* When calling getoption() in conftest.py, pass a default option

This is necessary to allow testing an installed spacy by running:

  pytest --pyargs spacy

* Add contributor agreement

* update bengali token rules for hyphen and digits (#2731)

* Less norm computations in token similarity (#2730)

* Less norm computations in token similarity

* Contributor agreement

* Remove ')' for clarity (#2737)

Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.

* added contributor agreement for mbkupfer (#2738)

* Basic support for Telugu language (#2751)

* Lex _attrs for polish language (#2750)

* Signed spaCy contributor agreement

* Added polish version of english lex_attrs

* Introduces a bulk merge function, in order to solve issue #653 (#2696)

* Fix comment

* Introduce bulk merge to increase performance on many span merges

* Sign contributor agreement

* Implement pull request suggestions

* Describe converters more explicitly (see #2643)

* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]

* Fix formatting

* Fix dependency scheme docs (closes #2705) [ci skip]

* Don't set stop word in example (closes #2657) [ci skip]

* Add words to portuguese language _num_words (#2759)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Update Indonesian model (#2752)

* adding e-KTP in tokenizer exceptions list

* add exception token

* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception

* add tokenizer exceptions list

* combining base_norms with norm_exceptions

* adding norm_exception

* fix double key in lemmatizer

* remove unused import on punctuation.py

* reformat stop_words to reduce number of lines, improve readibility

* updating tokenizer exception

* implement is_currency for lang/id

* adding orth_first_upper in tokenizer_exceptions

* update the norm_exception list

* remove bunch of abbreviations

* adding contributors file

* Fixed spaCy+Keras example (#2763)

* bug fixes in keras example

* created contributor agreement

* Adding French hyphenated first name (#2786)

* Fix typo (closes #2784)

* Fix typo (#2795) [ci skip]

Fixed typo on line 6 "regcognizer --> recognizer"

* Adding basic support for Sinhala language. (#2788)

* adding Sinhala language package, stop words, examples and lex_attrs.

* Adding contributor agreement

* Updating contributor agreement

* Also include lowercase norm exceptions

* Fix error (#2802)

* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

* added spaCy Contributor Agreement

* Add charlax's contributor agreement (#2805)

* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)

* Contributors agreement

* Contributors agreement

* Contributors agreement

* Add jupyter=True to displacy.render in documentation (#2806)

* Revert "Also include lowercase norm exceptions"

This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.

* Remove deprecated encoding argument to msgpack

* Set up dependency tree pattern matching skeleton (#2732)

* Fix bug when too many entity types. Fixes #2800

* Fix Python 2 test failure

* Require older msgpack-numpy

* Restore encoding arg on msgpack-numpy

* Try to fix version pin for msgpack-numpy

* Update Portuguese Language (#2790)

* Add words to portuguese language _num_words

* Add words to portuguese language _num_words

* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols

* Extended punctuation and norm_exceptions in the Portuguese language

* Correct error in spacy universe docs concerning spacy-lookup (#2814)

* Update Keras Example for (Parikh et al, 2016) implementation  (#2803)

* bug fixes in keras example

* created contributor agreement

* baseline for Parikh model

* initial version of parikh 2016 implemented

* tested asymmetric models

* fixed grevious error in normalization

* use standard SNLI test file

* begin to rework parikh example

* initial version of running example

* start to document the new version

* start to document the new version

* Update Decompositional Attention.ipynb

* fixed calls to similarity

* updated the README

* import sys package duh

* simplified indexing on mapping word to IDs

* stupid python indent error

* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround

* Fix typo (closes #2815) [ci skip]

* Update regex version dependency

* Set version to 2.0.13.dev3

* Skip seemingly problematic test

* Remove problematic test

* Try previous version of regex

* Revert "Remove problematic test"

This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.

* Unskip test

* Try older version of regex

* 💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Visual C++ link updated (#2842) (closes #2841) [ci skip]

* New landing page

* Add contribution agreement

* Correcting lang/ru/examples.py (#2845)

* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement

* Correct some grammatical inaccuracies in lang\ru\examples.py

* Move contributor agreement to separate file

* Set version to 2.0.13.dev4

* Add Persian(Farsi) language support (#2797)

* Also include lowercase norm exceptions

* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors

* Rule-based French Lemmatizer (#2818)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech 
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Set version to 2.0.13

* Fix formatting and consistency

* Update docs for new version [ci skip]

* Increment version [ci skip]

* Add info on wheels [ci skip]

* Adding "This is a sentence" example to Sinhala (#2846)

* Add wheels badge

* Update badge [ci skip]

* Update README.rst [ci skip]

* Update murmurhash pin

* Increment version to 2.0.14.dev0

* Update GPU docs for v2.0.14

* Add wheel to setup_requires

* Import prefer_gpu and require_gpu functions from Thinc

* Add tests for prefer_gpu() and require_gpu()

* Update requirements and setup.py

* Workaround bug in thinc require_gpu

* Set version to v2.0.14

* Update push-tag script

* Unhack prefer_gpu

* Require thinc 6.10.6

* Update prefer_gpu and require_gpu docs [ci skip]

* Fix specifiers for GPU

* Set version to 2.0.14.dev1

* Set version to 2.0.14

* Update Thinc version pin

* Increment version

* Fix msgpack-numpy version pin

* Increment version

* Update version to 2.0.16

* Update version [ci skip]

* Redundant ')' in the Stop words' example (#2856)

<!--- Provide a general summary of your changes in the title. -->

## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Documentation improvement regarding joblib and SO (#2867)

Some documentation improvements

## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)

### Types of change
Documentation

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* raise error when setting overlapping entities as doc.ents (#2880)

* Fix out-of-bounds access in NER training

The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!

This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.

* Change PyThaiNLP Url (#2876)

* Fix missing comma

* Add example showing a fix-up rule for space entities

* Set version to 2.0.17.dev0

* Update regex version

* Revert "Update regex version"

This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.

* Try setting older regex version, to align with conda

* Set version to 2.0.17

* Add spacy-js to universe [ci-skip]

* Add spacy-raspberry to universe (closes #2889)

* Add script to validate universe json [ci skip]

* Removed space in docs + added contributor indo (#2909)

* - removed unneeded space in documentation

* - added contributor info

* Allow input text of length up to max_length, inclusive (#2922)

* Include universe spec for spacy-wordnet component (#2919)

* feat: include universe spec for spacy-wordnet component

* chore: include spaCy contributor agreement

* Minor formatting changes [ci skip]

* Fix image [ci skip]

Twitter URL doesn't work on live site

* Check if the word is in one of the regular lists specific to each POS (#2886)

* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)

Resolves #2924.

## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)

### Types of change
bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix typo [ci skip]

* fixes symbolic link on py3 and windows (#2949)

* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948

* Update spacy/compat.py

Co-Authored-By: cicorias <cicorias@users.noreply.github.com>

* Fix formatting

* Update universe [ci skip]

* Catalan Language Support (#2940)

* Catalan language Support

* Ddding Catalan to documentation

* Sort languages alphabetically [ci skip]

* Update tests for pytest 4.x (#2965)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)

### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Fix regex pin to harmonize with conda (#2964)

* Update README.rst

* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)

Fixes #2976

* Fix typo

* Fix typo

* Remove duplicate file

* Require thinc 7.0.0.dev2

Fixes bug in gpu_ops that would use cupy instead of numpy on CPU

* Add missing import

* Fix error IDs

* Fix tests

											
										
										
											2018-11-29 18:30:29 +03:00
+								    else:
-												Ensure that doc.ents preserves kb_id annotations (#4294)

* bugfix: ensure doc.ents preserves kb_id annotations

* fix backward compatibility

* additional test

											
										
										
											2019-09-16 16:18:37 +03:00
+								        ent_id, ent_kb_id, ent_type, start, end = ent_info
 								    return ent_type, ent_kb_id, start, end