# cython: infer_types=True, bounds_check=False, profile=True
cimport cython
cimport numpy as np
from libc.string cimport memcpy
from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t

import copy
from collections import Counter, defaultdict
from enum import Enum
import itertools
import numpy
import srsly
from thinc.api import get_array_module
from thinc.util import copy_array
import warnings

from .span cimport Span
from .token cimport MISSING_DEP
from ._dict_proxies import SpanGroups
from .token cimport Token
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t
from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM

from ..attrs import intify_attr, IDS
from ..compat import copy_reg, pickle
from ..errors import Errors, Warnings
from ..morphology import Morphology
from .. import util
from .. import parts_of_speech
from .underscore import Underscore, get_ext_args
from ._retokenize import Retokenizer
from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS


DEF PADDING = 5


cdef int bounds_check(int i, int length, int padding) except -1:
    if (i + padding) < 0:
        raise IndexError(Errors.E026.format(i=i, length=length))
    if (i - padding) >= length:
        raise IndexError(Errors.E026.format(i=i, length=length))

cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
    if feat_name == LEMMA:
        return token.lemma
    elif feat_name == NORM:
        if not token.norm:
            return token.lex.norm
        return token.norm
    elif feat_name == POS:
        return token.pos
    elif feat_name == TAG:
        return token.tag
    elif feat_name == MORPH:
        return token.morph
    elif feat_name == DEP:
        return token.dep
    elif feat_name == HEAD:
        return token.head
    elif feat_name == SENT_START:
        return token.sent_start
    elif feat_name == SPACY:
        return token.spacy
    elif feat_name == ENT_IOB:
        return token.ent_iob
    elif feat_name == ENT_TYPE:
        return token.ent_type
    elif feat_name == ENT_ID:
        return token.ent_id
    elif feat_name == ENT_KB_ID:
        return token.ent_kb_id
    elif feat_name == IDX:
        return token.idx
    else:
        return Lexeme.get_struct_attr(token.lex, feat_name)


cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil:
    if feat_name == SENT_START:
        if token.sent_start == 1:
            return True
        else:
            return False
    else:
        return get_token_attr(token, feat_name)

class SetEntsDefault(str, Enum):
    blocked = "blocked"
    missing = "missing"
    outside = "outside"
    unmodified = "unmodified"

    @classmethod
    def values(cls):
        return list(cls.__members__.keys())

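# Illustrative sketch (not part of the module): the enum above lists the values
# accepted for the `default` argument of `Doc.set_ents` further down, e.g.
#
#     >>> SetEntsDefault.values()
#     ['blocked', 'missing', 'outside', 'unmodified']
#     >>> doc.set_ents([], default="missing")   # assumes an existing `doc`
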
cdef class Doc:
    """A sequence of Token objects. Access sentences and named entities, export
    annotations to numpy arrays, losslessly serialize to compressed binary
    strings. The `Doc` object holds an array of `TokenC` structs. The
    Python-level `Token` and `Span` objects are views of this array, i.e.
    they don't own the data themselves.

    EXAMPLE:
        Construction 1
        >>> doc = nlp(u'Some text')

        Construction 2
        >>> from spacy.tokens import Doc
        >>> doc = Doc(nlp.vocab, words=["hello", "world", "!"], spaces=[True, False, False])

    DOCS: https://spacy.io/api/doc
    """

    @classmethod
    def set_extension(cls, name, **kwargs):
        """Define a custom attribute which becomes available as `Doc._`.

        name (str): Name of the attribute to set.
        default: Optional default value of the attribute.
        getter (callable): Optional getter function.
        setter (callable): Optional setter function.
        method (callable): Optional method for method extension.
        force (bool): Force overwriting existing attribute.

        DOCS: https://spacy.io/api/doc#set_extension
        USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes
        """
        if cls.has_extension(name) and not kwargs.get("force", False):
            raise ValueError(Errors.E090.format(name=name, obj="Doc"))
        Underscore.doc_extensions[name] = get_ext_args(**kwargs)

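    # Usage sketch (illustrative only, assumes an `nlp` pipeline):
    #
    #     >>> from spacy.tokens import Doc
    #     >>> Doc.set_extension("is_greeting", default=False)
    #     >>> doc = nlp("hello world")
    #     >>> doc._.is_greeting = True
    #     >>> doc._.is_greeting
    #     True
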
    @classmethod
    def get_extension(cls, name):
        """Look up a previously registered extension by name.

        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple.

        DOCS: https://spacy.io/api/doc#get_extension
        """
        return Underscore.doc_extensions.get(name)

    @classmethod
    def has_extension(cls, name):
        """Check whether an extension has been registered.

        name (str): Name of the extension.
        RETURNS (bool): Whether the extension has been registered.

        DOCS: https://spacy.io/api/doc#has_extension
        """
        return name in Underscore.doc_extensions

    @classmethod
    def remove_extension(cls, name):
        """Remove a previously registered extension.

        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
            removed extension.

        DOCS: https://spacy.io/api/doc#remove_extension
        """
        if not cls.has_extension(name):
            raise ValueError(Errors.E046.format(name=name))
        return Underscore.doc_extensions.pop(name)

    def __init__(
        self,
        Vocab vocab,
        words=None,
        spaces=None,
        *,
        user_data=None,
        tags=None,
        pos=None,
        morphs=None,
        lemmas=None,
        heads=None,
        deps=None,
        sent_starts=None,
        ents=None,
    ):
        """Create a Doc object.

        vocab (Vocab): A vocabulary object, which must match any models you
            want to use (e.g. tokenizer, parser, entity recognizer).
        words (Optional[List[Union[str, int]]]): A list of unicode strings or
            hash values to add to the document as words. If `None`, defaults to
            an empty list.
        spaces (Optional[List[bool]]): A list of boolean values, of the same
            length as `words`. `True` means that the word is followed by a space,
            `False` means it is not. If `None`, defaults to `[True]*len(words)`.
        user_data (dict or None): Optional extra data to attach to the Doc.
        tags (Optional[List[str]]): A list of unicode strings, of the same
            length as words, to assign as token.tag. Defaults to None.
        pos (Optional[List[str]]): A list of unicode strings, of the same
            length as words, to assign as token.pos. Defaults to None.
        morphs (Optional[List[str]]): A list of unicode strings, of the same
            length as words, to assign as token.morph. Defaults to None.
        lemmas (Optional[List[str]]): A list of unicode strings, of the same
            length as words, to assign as token.lemma. Defaults to None.
        heads (Optional[List[int]]): A list of values, of the same length as
            words, to assign as heads. Head indices are the position of the
            head in the doc. Defaults to None.
        deps (Optional[List[str]]): A list of unicode strings, of the same
            length as words, to assign as token.dep. Defaults to None.
        sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
            the same length as words, to assign as token.is_sent_start. Will be
            overridden by heads if heads is provided. Defaults to None.
        ents (Optional[List[str]]): A list of unicode strings, of the same
            length as words, as IOB tags to assign as token.ent_iob and
            token.ent_type. Defaults to None.

        DOCS: https://spacy.io/api/doc#init
        """
        self.vocab = vocab
        size = max(20, (len(words) if words is not None else 0))
        self.mem = Pool()
        self.spans = SpanGroups(self)
        # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
        # However, we need to remember the true starting places, so that we can
        # realloc.
        assert size + (PADDING*2) > 0
        data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
        cdef int i
        for i in range(size + (PADDING*2)):
            data_start[i].lex = &EMPTY_LEXEME
            data_start[i].l_edge = i
            data_start[i].r_edge = i
        self.c = data_start + PADDING
        self.max_length = size
        self.length = 0
        self.sentiment = 0.0
        self.cats = {}
        self.user_hooks = {}
        self.user_token_hooks = {}
        self.user_span_hooks = {}
        self.tensor = numpy.zeros((0,), dtype="float32")
        self.user_data = {} if user_data is None else user_data
        self._vector = None
        self.noun_chunks_iterator = self.vocab.get_noun_chunks
        cdef bint has_space
        if words is None and spaces is not None:
            raise ValueError(Errors.E908)
        elif spaces is None and words is not None:
            self.has_unknown_spaces = True
        else:
            self.has_unknown_spaces = False
        words = words if words is not None else []
        spaces = spaces if spaces is not None else ([True] * len(words))
        if len(spaces) != len(words):
            raise ValueError(Errors.E027)
        cdef const LexemeC* lexeme
        for word, has_space in zip(words, spaces):
            if isinstance(word, str):
                lexeme = self.vocab.get(self.mem, word)
            elif isinstance(word, bytes):
                raise ValueError(Errors.E028.format(value=word))
            else:
                try:
                    lexeme = self.vocab.get_by_orth(self.mem, word)
                except TypeError:
                    raise TypeError(Errors.E1022.format(wtype=type(word)))
            self.push_back(lexeme, has_space)

        if heads is not None:
            heads = [head - i if head is not None else 0 for i, head in enumerate(heads)]
        if deps is not None:
            MISSING_DEP_ = self.vocab.strings[MISSING_DEP]
            deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
        if deps and not heads:
            heads = [0] * len(deps)
        if heads and not deps:
            raise ValueError(Errors.E1017)
        if sent_starts is not None:
            for i in range(len(sent_starts)):
                if sent_starts[i] is True:
                    sent_starts[i] = 1
                elif sent_starts[i] is False:
                    sent_starts[i] = -1
                elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
                    sent_starts[i] = 0
        if pos is not None:
            for pp in set(pos):
                if pp not in parts_of_speech.IDS:
                    raise ValueError(Errors.E1021.format(pp=pp))
        ent_iobs = None
        ent_types = None
        if ents is not None:
            iob_strings = Token.iob_strings()
            # make valid IOB2 out of IOB1 or IOB2
            for i, ent in enumerate(ents):
                if ent == "":
                    ents[i] = None
                elif ent is not None and not isinstance(ent, str):
                    raise ValueError(Errors.E177.format(tag=ent))
                if i < len(ents) - 1:
                    # OI -> OB
                    if (ent is None or ent.startswith("O")) and \
                            (ents[i+1] is not None and ents[i+1].startswith("I")):
                        ents[i+1] = "B" + ents[i+1][1:]
                    # B-TYPE1 I-TYPE2 or I-TYPE1 I-TYPE2 -> B/I-TYPE1 B-TYPE2
                    if ent is not None and ents[i+1] is not None and \
                            (ent.startswith("B") or ent.startswith("I")) and \
                            ents[i+1].startswith("I") and \
                            ent[1:] != ents[i+1][1:]:
                        ents[i+1] = "B" + ents[i+1][1:]
            ent_iobs = []
            ent_types = []
            for ent in ents:
                if ent is None:
                    ent_iobs.append(iob_strings.index(""))
                    ent_types.append("")
                elif ent == "O":
                    ent_iobs.append(iob_strings.index(ent))
                    ent_types.append("")
                else:
                    if len(ent) < 3 or ent[1] != "-":
                        raise ValueError(Errors.E177.format(tag=ent))
                    ent_iob, ent_type = ent.split("-", 1)
                    if ent_iob not in iob_strings:
                        raise ValueError(Errors.E177.format(tag=ent))
                    ent_iob = iob_strings.index(ent_iob)
                    ent_iobs.append(ent_iob)
                    ent_types.append(ent_type)
        headings = []
        values = []
        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts, ent_iobs, ent_types]
        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START, ENT_IOB, ENT_TYPE]
        for a, annot in enumerate(annotations):
            if annot is not None:
                if len(annot) != len(words):
                    raise ValueError(Errors.E189)
                headings.append(possible_headings[a])
                if annot is not heads and annot is not sent_starts and annot is not ent_iobs:
                    values.extend(annot)
        for value in values:
            if value is not None:
                self.vocab.strings.add(value)

        # if there are any other annotations, set them
        if headings:
            attrs = self.to_array(headings)

            j = 0
            for annot in annotations:
                if annot:
                    if annot is heads or annot is sent_starts or annot is ent_iobs:
                        for i in range(len(words)):
                            if attrs.ndim == 1:
                                attrs[i] = annot[i]
                            else:
                                attrs[i, j] = annot[i]
                    elif annot is morphs:
                        for i in range(len(words)):
                            morph_key = vocab.morphology.add(morphs[i])
                            if attrs.ndim == 1:
                                attrs[i] = morph_key
                            else:
                                attrs[i, j] = morph_key
                    else:
                        for i in range(len(words)):
                            if attrs.ndim == 1:
                                attrs[i] = self.vocab.strings[annot[i]]
                            else:
                                attrs[i, j] = self.vocab.strings[annot[i]]
                    j += 1
            self.from_array(headings, attrs)

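    # Constructor sketch (illustrative only, assumes a `Vocab` from an existing
    # pipeline; heads are absolute token indices and ents use IOB2 strings):
    #
    #     >>> words = ["Apple", "is", "looking", "at", "U.K.", "startups"]
    #     >>> spaces = [True, True, True, True, True, False]
    #     >>> heads = [2, 2, 2, 2, 5, 3]
    #     >>> deps = ["nsubj", "aux", "ROOT", "prep", "compound", "pobj"]
    #     >>> ents = ["B-ORG", "O", "O", "O", "B-GPE", "O"]
    #     >>> doc = Doc(nlp.vocab, words=words, spaces=spaces, heads=heads,
    #     ...           deps=deps, ents=ents)
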
    @property
    def _(self):
        """Custom extension attributes registered via `set_extension`."""
        return Underscore(Underscore.doc_extensions, self)

    @property
    def is_tagged(self):
        warnings.warn(Warnings.W107.format(prop="is_tagged", attr="TAG"), DeprecationWarning)
        return self.has_annotation("TAG")

    @property
    def is_parsed(self):
        warnings.warn(Warnings.W107.format(prop="is_parsed", attr="DEP"), DeprecationWarning)
        return self.has_annotation("DEP")

    @property
    def is_nered(self):
        warnings.warn(Warnings.W107.format(prop="is_nered", attr="ENT_IOB"), DeprecationWarning)
        return self.has_annotation("ENT_IOB")

    @property
    def is_sentenced(self):
        warnings.warn(Warnings.W107.format(prop="is_sentenced", attr="SENT_START"), DeprecationWarning)
        return self.has_annotation("SENT_START")

    def has_annotation(self, attr, *, require_complete=False):
        """Check whether the doc contains annotation on a token attribute.

        attr (Union[int, str]): The attribute string name or int ID.
        require_complete (bool): Whether to check that the attribute is set on
            every token in the doc.
        RETURNS (bool): Whether annotation is present.

        DOCS: https://spacy.io/api/doc#has_annotation
        """
        # empty docs are always annotated
        if self.length == 0:
            return True
        cdef int i
        cdef int range_start = 0
        if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
            attr = SENT_START
        elif attr == "IS_SENT_END" or attr == self.vocab.strings["IS_SENT_END"]:
            attr = SENT_START
        attr = intify_attr(attr)
        # adjust attributes
        if attr == HEAD:
            # HEAD does not have an unset state, so rely on DEP
            attr = DEP
        # special cases for sentence boundaries
        if attr == SENT_START:
            if "sents" in self.user_hooks:
                return True
            # docs of length 1 always have sentence boundaries
            if self.length == 1:
                return True
            range_start = 1
        if require_complete:
            return all(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length))
        else:
            return any(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length))

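    # Usage sketch (illustrative only, assumes a `doc` produced by a pipeline
    # with a tagger and parser):
    #
    #     >>> doc.has_annotation("TAG")
    #     True
    #     >>> doc.has_annotation("DEP", require_complete=True)
    #     True
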
    def __getitem__(self, object i):
        """Get a `Token` or `Span` object.

        i (int or tuple): The index of the token, or the slice of the document
            to get.
        RETURNS (Token or Span): The token at `doc[i]`, or the span at
            `doc[start : end]`.

        EXAMPLE:
            >>> doc[i]
            Get the `Token` object at position `i`, where `i` is an integer.
            Negative indexing is supported, and follows the usual Python
            semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.

            >>> doc[start : end]
            Get a `Span` object, starting at position `start` and ending at
            position `end`, where `start` and `end` are token indices. For
            instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and
            4. Stepped slices (e.g. `doc[start : end : step]`) are not
            supported, as `Span` objects must be contiguous (cannot have gaps).
            You can use negative indices and open-ended ranges, which have
            their normal Python semantics.

        DOCS: https://spacy.io/api/doc#getitem
        """
        if isinstance(i, slice):
            start, stop = util.normalize_slice(len(self), i.start, i.stop, i.step)
            return Span(self, start, stop, label=0)
        if i < 0:
            i = self.length + i
        bounds_check(i, self.length, PADDING)
        return Token.cinit(self.vocab, &self.c[i], i, self)

    def __iter__(self):
        """Iterate over `Token` objects, from which the annotations can be
        easily accessed. This is the main way of accessing `Token` objects,
        which are the main way annotations are accessed from Python. If faster-
        than-Python speeds are required, you can instead access the annotations
        as a numpy array, or access the underlying C data directly from Cython.

        DOCS: https://spacy.io/api/doc#iter
        """
        cdef int i
        for i in range(self.length):
            yield Token.cinit(self.vocab, &self.c[i], i, self)

    def __len__(self):
        """The number of tokens in the document.

        RETURNS (int): The number of tokens in the document.

        DOCS: https://spacy.io/api/doc#len
        """
        return self.length

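    # Usage sketch (illustrative only, assumes an existing `doc`):
    #
    #     >>> len(doc)                   # number of tokens
    #     >>> [t.text for t in doc]      # iteration yields Token objects
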
    def __unicode__(self):
        return "".join([t.text_with_ws for t in self])

    def __bytes__(self):
        return "".join([t.text_with_ws for t in self]).encode("utf-8")

    def __str__(self):
        return self.__unicode__()

    def __repr__(self):
        return self.__str__()

    @property
    def doc(self):
        return self

    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict"):
        """Create a `Span` object from the slice
        `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
        created.

        doc (Doc): The parent document.
        start_idx (int): The index of the first character of the span.
        end_idx (int): The index of the first character after the span.
        label (uint64 or string): A label to attach to the Span, e.g. for
            named entities.
        kb_id (uint64 or string): An ID from a KB to capture the meaning of a
            named entity.
        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
            the span.
        alignment_mode (str): How character indices are aligned to token
            boundaries. Options: "strict" (character indices must be aligned
            with token boundaries), "contract" (span of all tokens completely
            within the character span), "expand" (span of all tokens at least
            partially covered by the character span). Defaults to "strict".
        RETURNS (Span): The newly constructed object.

        DOCS: https://spacy.io/api/doc#char_span
        """
        if not isinstance(label, int):
            label = self.vocab.strings.add(label)
        if not isinstance(kb_id, int):
            kb_id = self.vocab.strings.add(kb_id)
        alignment_modes = ("strict", "contract", "expand")
        if alignment_mode not in alignment_modes:
            raise ValueError(
                Errors.E202.format(
                    name="alignment",
                    mode=alignment_mode,
                    modes=", ".join(alignment_modes),
                )
            )
        cdef int start = token_by_char(self.c, self.length, start_idx)
        if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
            return None
        # end_idx is exclusive, so find the token at one char before
        cdef int end = token_by_char(self.c, self.length, end_idx - 1)
        if end < 0 or (alignment_mode == "strict" and end_idx != self[end].idx + len(self[end])):
            return None
        # Adjust start and end by alignment_mode
        if alignment_mode == "contract":
            if self[start].idx < start_idx:
                start += 1
            if end_idx < self[end].idx + len(self[end]):
                end -= 1
            # if no tokens are completely within the span, return None
            if end < start:
                return None
        elif alignment_mode == "expand":
            # Don't consider the trailing whitespace to be part of the previous
            # token
            if start_idx == self[start].idx + len(self[start]):
                start += 1
        # Currently we have the token index, we want the range-end index
        end += 1
        cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
        return span

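    # Usage sketch (illustrative only, assumes an `nlp` pipeline): character
    # offsets need not line up with token boundaries unless alignment_mode
    # is "strict".
    #
    #     >>> doc = nlp("I like New York")
    #     >>> doc.char_span(7, 15, label="GPE")               # exactly "New York"
    #     >>> doc.char_span(8, 14)                            # misaligned -> None
    #     >>> doc.char_span(8, 14, alignment_mode="expand")   # snaps to "New York"
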
    def similarity(self, other):
        """Make a semantic similarity estimate. The default estimate is cosine
        similarity using an average of word vectors.

        other (object): The object to compare with. By default, accepts `Doc`,
            `Span`, `Token` and `Lexeme` objects.
        RETURNS (float): A scalar similarity score. Higher is more similar.

        DOCS: https://spacy.io/api/doc#similarity
        """
        if "similarity" in self.user_hooks:
            return self.user_hooks["similarity"](self, other)
        if isinstance(other, (Lexeme, Token)) and self.length == 1:
            if self.c[0].lex.orth == other.orth:
                return 1.0
        elif isinstance(other, (Span, Doc)) and len(self) == len(other):
            similar = True
            for i in range(self.length):
                if self[i].orth != other[i].orth:
                    similar = False
                    break
            if similar:
                return 1.0
        if self.vocab.vectors.n_keys == 0:
            warnings.warn(Warnings.W007.format(obj="Doc"))
        if self.vector_norm == 0 or other.vector_norm == 0:
            warnings.warn(Warnings.W008.format(obj="Doc"))
            return 0.0
        vector = self.vector
        xp = get_array_module(vector)
        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
        return result.item()

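    # Usage sketch (illustrative only, assumes a pipeline with word vectors;
    # pipelines without vectors fall back to tensors and emit a warning):
    #
    #     >>> doc1 = nlp("I like salty fries and hamburgers.")
    #     >>> doc2 = nlp("Fast food tastes very good.")
    #     >>> doc1.similarity(doc2)   # cosine of the averaged word vectors
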
    @property
    def has_vector(self):
        """A boolean value indicating whether a word vector is associated with
        the object.

        RETURNS (bool): Whether a word vector is associated with the object.

        DOCS: https://spacy.io/api/doc#has_vector
        """
        if "has_vector" in self.user_hooks:
            return self.user_hooks["has_vector"](self)
        elif self.vocab.vectors.size:
            return True
        elif self.tensor.size:
            return True
        else:
            return False

    property vector:
        """A real-valued meaning representation. Defaults to an average of the
        token vectors.

        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
            representing the document's semantics.

        DOCS: https://spacy.io/api/doc#vector
        """
        def __get__(self):
            if "vector" in self.user_hooks:
                return self.user_hooks["vector"](self)
            if self._vector is not None:
                return self._vector
            xp = get_array_module(self.vocab.vectors.data)
            if not len(self):
                self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
                return self._vector
            elif self.vocab.vectors.size > 0:
                self._vector = sum(t.vector for t in self) / len(self)
                return self._vector
            elif self.tensor.size > 0:
                self._vector = self.tensor.mean(axis=0)
                return self._vector
            else:
                return xp.zeros((self.vocab.vectors_length,), dtype="float32")

        def __set__(self, value):
            self._vector = value

    property vector_norm:
        """The L2 norm of the document's vector representation.

        RETURNS (float): The L2 norm of the vector representation.

        DOCS: https://spacy.io/api/doc#vector_norm
        """
        def __get__(self):
            if "vector_norm" in self.user_hooks:
                return self.user_hooks["vector_norm"](self)
            cdef float value
            cdef double norm = 0
            if self._vector_norm is None:
                norm = 0.0
                for value in self.vector:
                    norm += value * value
                self._vector_norm = sqrt(norm) if norm != 0 else 0
            return self._vector_norm

        def __set__(self, value):
            self._vector_norm = value

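    # Relationship sketch (illustrative only, assumes a `doc` with vectors):
    # the norm is the L2 norm of `doc.vector`, i.e. roughly
    #
    #     >>> import numpy
    #     >>> numpy.isclose(doc.vector_norm, numpy.sqrt((doc.vector ** 2).sum()))
    #     True
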
    @property
    def text(self):
        """A unicode representation of the document text.

        RETURNS (str): The original verbatim text of the document.
        """
        return "".join(t.text_with_ws for t in self)

    @property
    def text_with_ws(self):
        """An alias of `Doc.text`, provided for duck-type compatibility with
        `Span` and `Token`.

        RETURNS (str): The original verbatim text of the document.
        """
        return self.text

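    # Usage sketch (illustrative only, assumes an `nlp` pipeline): `doc.text`
    # reconstructs the original string from each token's `text_with_ws`, so
    # whitespace is preserved verbatim:
    #
    #     >>> doc = nlp("Hello,  world!")
    #     >>> doc.text == "Hello,  world!"
    #     True
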
    property ents:
        """The named entities in the document. Returns a tuple of named entity
        `Span` objects, if the entity recognizer has been applied.

        RETURNS (tuple): Entities in the document, one `Span` per entity.

        DOCS: https://spacy.io/api/doc#ents
        """
        def __get__(self):
            cdef int i
            cdef const TokenC* token
            cdef int start = -1
            cdef attr_t label = 0
            cdef attr_t kb_id = 0
            output = []
            for i in range(self.length):
                token = &self.c[i]
                if token.ent_iob == 1:
                    if start == -1:
                        seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]]
                        raise ValueError(Errors.E093.format(seq=" ".join(seq)))
                elif token.ent_iob == 2 or token.ent_iob == 0 or \
                        (token.ent_iob == 3 and token.ent_type == 0):
                    if start != -1:
                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
                    start = -1
                    label = 0
                    kb_id = 0
                elif token.ent_iob == 3:
                    if start != -1:
                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
                    start = i
                    label = token.ent_type
                    kb_id = token.ent_kb_id
            if start != -1:
                output.append(Span(self, start, self.length, label=label, kb_id=kb_id))
            # remove empty-label spans
            output = [o for o in output if o.label_ != ""]
            return tuple(output)

        def __set__(self, ents):
            # TODO:
            # 1. Test basic data-driven ORTH gazetteer
            # 2. Test more nuanced date and currency regex
            cdef attr_t entity_type, kb_id
            cdef int ent_start, ent_end
            ent_spans = []
|
|
|
|
for ent_info in ents:
|
2020-09-21 23:58:03 +03:00
|
|
|
|
entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info)
|
|
|
|
|
if isinstance(entity_type_, str):
|
|
|
|
|
self.vocab.strings.add(entity_type_)
|
2020-09-24 13:36:51 +03:00
|
|
|
|
span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id)
|
|
|
|
|
ent_spans.append(span)
|
|
|
|
|
self.set_ents(ent_spans, default=SetEntsDefault.outside)
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
2020-09-21 16:54:05 +03:00
|
|
|
|
def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside):
|
|
|
|
|
"""Set entity annotation.
|
|
|
|
|
|
|
|
|
|
entities (List[Span]): Spans with labels to set as entities.
|
|
|
|
|
blocked (Optional[List[Span]]): Spans to set as 'blocked' (never an
|
|
|
|
|
entity) for spacy's built-in NER component. Other components may
|
|
|
|
|
ignore this setting.
|
|
|
|
|
missing (Optional[List[Span]]): Spans with missing/unknown entity
|
|
|
|
|
information.
|
|
|
|
|
outside (Optional[List[Span]]): Spans outside of entities (O in IOB).
|
|
|
|
|
default (str): How to set entity annotation for tokens outside of any
|
|
|
|
|
provided spans. Options: "blocked", "missing", "outside" and
|
|
|
|
|
"unmodified" (preserve current state). Defaults to "outside".
|
2020-09-17 22:10:41 +03:00
|
|
|
|
"""
|
2020-09-21 16:54:05 +03:00
|
|
|
|
if default not in SetEntsDefault.values():
|
|
|
|
|
raise ValueError(Errors.E1011.format(default=default, modes=", ".join(SetEntsDefault)))
|
|
|
|
|
|
2020-09-24 13:36:51 +03:00
|
|
|
|
# Ignore spans with missing labels
|
|
|
|
|
entities = [ent for ent in entities if ent.label > 0]
|
|
|
|
|
|
2020-09-21 16:54:05 +03:00
|
|
|
|
if blocked is None:
|
|
|
|
|
blocked = tuple()
|
|
|
|
|
if missing is None:
|
|
|
|
|
missing = tuple()
|
|
|
|
|
if outside is None:
|
|
|
|
|
outside = tuple()
|
|
|
|
|
|
|
|
|
|
# Find all tokens covered by spans and check that none are overlapping
|
2020-09-24 13:36:51 +03:00
|
|
|
|
cdef int i
|
2020-09-21 16:54:05 +03:00
|
|
|
|
seen_tokens = set()
|
2020-09-22 15:42:51 +03:00
|
|
|
|
for span in itertools.chain.from_iterable([entities, blocked, missing, outside]):
|
2020-09-21 16:54:05 +03:00
|
|
|
|
if not isinstance(span, Span):
|
|
|
|
|
raise ValueError(Errors.E1012.format(span=span))
|
|
|
|
|
for i in range(span.start, span.end):
|
|
|
|
|
if i in seen_tokens:
|
|
|
|
|
raise ValueError(Errors.E1010.format(i=i))
|
|
|
|
|
seen_tokens.add(i)
|
|
|
|
|
|
|
|
|
|
# Set all specified entity information
|
|
|
|
|
for span in entities:
|
|
|
|
|
for i in range(span.start, span.end):
|
|
|
|
|
if i == span.start:
|
|
|
|
|
self.c[i].ent_iob = 3
|
|
|
|
|
else:
|
|
|
|
|
self.c[i].ent_iob = 1
|
|
|
|
|
self.c[i].ent_type = span.label
|
2020-09-24 13:36:51 +03:00
|
|
|
|
self.c[i].ent_kb_id = span.kb_id
|
2020-09-21 16:54:05 +03:00
|
|
|
|
for span in blocked:
|
2020-09-17 22:10:41 +03:00
|
|
|
|
for i in range(span.start, span.end):
|
|
|
|
|
self.c[i].ent_iob = 3
|
|
|
|
|
self.c[i].ent_type = 0
|
2020-09-21 16:54:05 +03:00
|
|
|
|
for span in missing:
|
|
|
|
|
for i in range(span.start, span.end):
|
|
|
|
|
self.c[i].ent_iob = 0
|
|
|
|
|
self.c[i].ent_type = 0
|
|
|
|
|
for span in outside:
|
|
|
|
|
for i in range(span.start, span.end):
|
|
|
|
|
self.c[i].ent_iob = 2
|
|
|
|
|
self.c[i].ent_type = 0
|
|
|
|
|
|
|
|
|
|
# Set tokens outside of all provided spans
|
|
|
|
|
if default != SetEntsDefault.unmodified:
|
|
|
|
|
for i in range(self.length):
|
|
|
|
|
if i not in seen_tokens:
|
|
|
|
|
self.c[i].ent_type = 0
|
|
|
|
|
if default == SetEntsDefault.outside:
|
|
|
|
|
self.c[i].ent_iob = 2
|
|
|
|
|
elif default == SetEntsDefault.missing:
|
|
|
|
|
self.c[i].ent_iob = 0
|
|
|
|
|
elif default == SetEntsDefault.blocked:
|
|
|
|
|
self.c[i].ent_iob = 3
|
|
|
|
|
|
|
|
|
|
# Fix any resulting inconsistent annotation
|
|
|
|
|
for i in range(self.length - 1):
|
|
|
|
|
# I must follow B or I: convert I to B
|
|
|
|
|
if (self.c[i].ent_iob == 0 or self.c[i].ent_iob == 2) and \
|
|
|
|
|
self.c[i+1].ent_iob == 1:
|
|
|
|
|
self.c[i+1].ent_iob = 3
|
|
|
|
|
# Change of type with BI or II: convert second I to B
|
|
|
|
|
if self.c[i].ent_type != self.c[i+1].ent_type and \
|
|
|
|
|
(self.c[i].ent_iob == 3 or self.c[i].ent_iob == 1) and \
|
|
|
|
|
self.c[i+1].ent_iob == 1:
|
|
|
|
|
self.c[i+1].ent_iob = 3
|
2020-09-17 22:10:41 +03:00
|
|
|
|
|
2019-03-11 17:59:09 +03:00
|
|
|
|
@property
|
|
|
|
|
def noun_chunks(self):
|
2017-05-18 23:17:09 +03:00
|
|
|
|
"""Iterate over the base noun phrases in the document. Yields base
|
2021-01-17 14:56:05 +03:00
|
|
|
|
noun-phrase `Span` objects, if the language has a noun chunk iterator.
|
|
|
|
|
Raises a NotImplementedError otherwise.
|
|
|
|
|
|
|
|
|
|
A base noun phrase, or "NP chunk", is a noun
|
2017-10-27 16:41:45 +03:00
|
|
|
|
phrase that does not permit other NPs to be nested within it – so no
|
|
|
|
|
NP-level coordination, no prepositional phrases, and no relative
|
|
|
|
|
clauses.
|
2017-05-18 23:17:09 +03:00
|
|
|
|
|
|
|
|
|
YIELDS (Span): Noun chunks in the document.
|
2019-03-08 13:42:26 +03:00
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
|
DOCS: https://spacy.io/api/doc#noun_chunks
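EXAMPLE (an illustrative sketch; assumes `nlp` is a pipeline with a
parser, e.g. en_core_web_sm, so a noun chunk iterator is available):
>>> doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
>>> chunks = [chunk.text for chunk in doc.noun_chunks]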
|
2017-04-15 14:05:15 +03:00
|
|
|
|
"""
|
2021-01-17 14:56:05 +03:00
|
|
|
|
if self.noun_chunks_iterator is None:
|
|
|
|
|
raise NotImplementedError(Errors.E894.format(lang=self.vocab.lang))
|
2020-05-21 19:39:06 +03:00
|
|
|
|
|
2019-03-11 17:59:09 +03:00
|
|
|
|
# Accumulate the result before beginning to iterate over it. This
|
2021-01-17 14:56:05 +03:00
|
|
|
|
# prevents the tokenization from being changed out from under us
|
2019-03-11 17:59:09 +03:00
|
|
|
|
# during the iteration. The tricky thing here is that Span accepts
|
2021-01-17 14:56:05 +03:00
|
|
|
|
# its tokenization changing, so it's okay once we have the Span
|
2019-03-11 17:59:09 +03:00
|
|
|
|
# objects. See Issue #375.
|
|
|
|
|
spans = []
|
2021-01-17 14:56:05 +03:00
|
|
|
|
for start, end, label in self.noun_chunks_iterator(self):
|
|
|
|
|
spans.append(Span(self, start, end, label=label))
|
2019-03-11 17:59:09 +03:00
|
|
|
|
for span in spans:
|
|
|
|
|
yield span
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def sents(self):
|
2017-05-18 23:17:09 +03:00
|
|
|
|
"""Iterate over the sentences in the document. Yields sentence `Span`
|
2020-09-16 21:32:38 +03:00
|
|
|
|
objects. Sentence spans have no label.
|
2017-05-18 23:17:09 +03:00
|
|
|
|
|
2019-03-08 13:42:26 +03:00
|
|
|
|
YIELDS (Span): Sentences in the document.
|
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
|
DOCS: https://spacy.io/api/doc#sents
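EXAMPLE (an illustrative sketch; assumes `nlp` is a pipeline that sets
sentence boundaries, e.g. via a parser or senter component):
>>> doc = nlp("This is a sentence. Here's another one.")
>>> sentences = list(doc.sents)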
|
2015-07-13 20:58:26 +03:00
|
|
|
|
"""
|
2020-09-17 01:14:01 +03:00
|
|
|
|
if not self.has_annotation("SENT_START"):
|
2019-03-11 17:59:09 +03:00
|
|
|
|
raise ValueError(Errors.E030)
|
|
|
|
|
if "sents" in self.user_hooks:
|
|
|
|
|
yield from self.user_hooks["sents"](self)
|
|
|
|
|
else:
|
|
|
|
|
start = 0
|
|
|
|
|
for i in range(1, self.length):
|
|
|
|
|
if self.c[i].sent_start == 1:
|
|
|
|
|
yield Span(self, start, i)
|
|
|
|
|
start = i
|
|
|
|
|
if start != self.length:
|
|
|
|
|
yield Span(self, start, self.length)
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
2019-03-11 16:21:40 +03:00
|
|
|
|
@property
|
|
|
|
|
def lang(self):
|
|
|
|
|
"""RETURNS (uint64): ID of the language of the doc's vocabulary."""
|
|
|
|
|
return self.vocab.strings[self.vocab.lang]
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def lang_(self):
|
2020-05-24 18:20:58 +03:00
|
|
|
|
"""RETURNS (str): Language of the doc's vocabulary, e.g. 'en'."""
|
2019-03-11 16:21:40 +03:00
|
|
|
|
return self.vocab.lang
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
2015-07-13 22:46:02 +03:00
|
|
|
|
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
|
2015-07-13 20:58:26 +03:00
|
|
|
|
if self.length == self.max_length:
|
|
|
|
|
self._realloc(self.length * 2)
|
2015-11-03 16:15:14 +03:00
|
|
|
|
cdef TokenC* t = &self.c[self.length]
|
2015-08-28 03:02:33 +03:00
|
|
|
|
if LexemeOrToken is const_TokenC_ptr:
|
2015-07-13 20:58:26 +03:00
|
|
|
|
t[0] = lex_or_tok[0]
|
|
|
|
|
else:
|
|
|
|
|
t.lex = lex_or_tok
|
2015-07-13 22:46:02 +03:00
|
|
|
|
if self.length == 0:
|
|
|
|
|
t.idx = 0
|
|
|
|
|
else:
|
|
|
|
|
t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
|
2015-09-09 04:39:46 +03:00
|
|
|
|
t.l_edge = self.length
|
|
|
|
|
t.r_edge = self.length
|
2018-04-03 16:50:31 +03:00
|
|
|
|
if t.lex.orth == 0:
|
|
|
|
|
raise ValueError(Errors.E031.format(i=self.length))
|
2015-07-13 22:46:02 +03:00
|
|
|
|
t.spacy = has_space
|
2015-07-13 20:58:26 +03:00
|
|
|
|
self.length += 1
|
2019-02-27 13:17:17 +03:00
|
|
|
|
if self.length == 1:
|
|
|
|
|
# Set token.sent_start to 1 for first token. See issue #2869
|
|
|
|
|
self.c[0].sent_start = 1
|
2015-07-13 22:46:02 +03:00
|
|
|
|
return t.idx + t.lex.length + t.spacy
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
|
|
|
|
@cython.boundscheck(False)
|
|
|
|
|
cpdef np.ndarray to_array(self, object py_attr_ids):
|
2017-10-19 17:07:14 +03:00
|
|
|
|
"""Export given token attributes to a numpy `ndarray`.
|
2017-10-27 16:41:45 +03:00
|
|
|
|
If `attr_ids` is a sequence of M attributes, the output array will be
|
|
|
|
|
of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If
|
|
|
|
|
`attr_ids` is a single attribute, the output shape will be (N,). You
|
|
|
|
|
can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or
|
|
|
|
|
string name (e.g. 'LEMMA' or 'lemma').
|
2017-10-19 17:07:14 +03:00
|
|
|
|
|
2021-10-14 16:21:40 +03:00
|
|
|
|
py_attr_ids (list): A list of attributes (int IDs or string names).
|
2017-05-18 23:17:09 +03:00
|
|
|
|
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
|
|
|
|
|
per word, and one column per attribute indicated in the input
|
|
|
|
|
`attr_ids`.
|
|
|
|
|
|
|
|
|
|
EXAMPLE:
|
|
|
|
|
>>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
|
|
|
|
|
>>> doc = nlp(text)
|
|
|
|
|
>>> # All strings mapped to integers, for easy export to numpy
|
|
|
|
|
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
2015-07-13 20:58:26 +03:00
|
|
|
|
"""
|
|
|
|
|
cdef int i, j
|
|
|
|
|
cdef attr_id_t feature
|
2015-07-17 22:20:48 +03:00
|
|
|
|
cdef np.ndarray[attr_t, ndim=2] output
|
2017-10-19 17:07:14 +03:00
|
|
|
|
# Handle scalar/list inputs of strings/ints for py_attr_ids
|
2018-12-29 18:24:40 +03:00
|
|
|
|
# See also #3064
|
2019-12-22 03:53:56 +03:00
|
|
|
|
if isinstance(py_attr_ids, str):
|
2018-12-29 18:24:40 +03:00
|
|
|
|
# Handle inputs like doc.to_array('ORTH')
|
|
|
|
|
py_attr_ids = [py_attr_ids]
|
2019-03-08 13:42:26 +03:00
|
|
|
|
elif not hasattr(py_attr_ids, "__iter__"):
|
2018-12-29 18:24:40 +03:00
|
|
|
|
# Handle inputs like doc.to_array(ORTH)
|
2017-10-20 14:39:37 +03:00
|
|
|
|
py_attr_ids = [py_attr_ids]
|
|
|
|
|
# Allow strings, e.g. 'lemma' or 'LEMMA'
|
2020-06-26 20:34:12 +03:00
|
|
|
|
try:
|
|
|
|
|
py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
|
2017-10-20 14:39:37 +03:00
|
|
|
|
for id_ in py_attr_ids]
|
2020-06-26 20:34:12 +03:00
|
|
|
|
except KeyError as msg:
|
|
|
|
|
keys = [k for k in IDS.keys() if not k.startswith("FLAG")]
|
2020-08-06 00:53:21 +03:00
|
|
|
|
raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
|
2017-10-27 18:07:26 +03:00
|
|
|
|
# Make an array from the attributes --- otherwise our inner loop is
|
|
|
|
|
# Python dict iteration.
|
2019-03-08 13:42:26 +03:00
|
|
|
|
cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i")
|
|
|
|
|
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
|
2017-11-17 20:55:56 +03:00
|
|
|
|
c_output = <attr_t*>output.data
|
|
|
|
|
c_attr_ids = <attr_id_t*>attr_ids.data
|
|
|
|
|
cdef TokenC* token
|
|
|
|
|
cdef int nr_attr = attr_ids.shape[0]
|
2015-07-13 20:58:26 +03:00
|
|
|
|
for i in range(self.length):
|
2017-11-17 20:55:56 +03:00
|
|
|
|
token = &self.c[i]
|
|
|
|
|
for j in range(nr_attr):
|
|
|
|
|
c_output[i*nr_attr + j] = get_token_attr(token, c_attr_ids[j])
|
2017-10-20 14:39:37 +03:00
|
|
|
|
# Handle 1d case
|
|
|
|
|
return output if len(attr_ids) >= 2 else output.reshape((self.length,))
|
|
|
|
|
|
2019-07-11 14:05:53 +03:00
|
|
|
|
def count_by(self, attr_id_t attr_id, exclude=None, object counts=None):
|
2017-05-18 23:17:09 +03:00
|
|
|
|
"""Count the frequencies of a given attribute. Produces a dict of
|
|
|
|
|
`{attribute (int): count (int)}` frequencies, keyed by the values of
|
|
|
|
|
the given attribute ID.
|
|
|
|
|
|
|
|
|
|
attr_id (int): The attribute ID to key the counts.
|
|
|
|
|
RETURNS (dict): A dictionary mapping attributes to integer counts.
|
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
|
DOCS: https://spacy.io/api/doc#count_by
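EXAMPLE (an illustrative sketch; assumes `nlp` is a loaded pipeline):
>>> from spacy.attrs import ORTH
>>> doc = nlp("apple apple orange banana")
>>> counts = doc.count_by(ORTH)  # maps each orth hash to its frequency
>>> counts[doc.vocab.strings["apple"]]
2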
|
2015-07-13 20:58:26 +03:00
|
|
|
|
"""
|
|
|
|
|
cdef int i
|
|
|
|
|
cdef attr_t attr
|
|
|
|
|
cdef size_t count
|
2017-02-27 00:27:11 +03:00
|
|
|
|
|
2015-07-14 04:20:09 +03:00
|
|
|
|
if counts is None:
|
2019-07-11 14:05:53 +03:00
|
|
|
|
counts = Counter()
|
2015-07-14 04:20:09 +03:00
|
|
|
|
output_dict = True
|
|
|
|
|
else:
|
|
|
|
|
output_dict = False
|
|
|
|
|
# Take this check out of the loop, for a bit of extra speed
|
|
|
|
|
if exclude is None:
|
|
|
|
|
for i in range(self.length):
|
2019-07-11 14:09:22 +03:00
|
|
|
|
counts[get_token_attr(&self.c[i], attr_id)] += 1
|
2015-07-14 04:20:09 +03:00
|
|
|
|
else:
|
|
|
|
|
for i in range(self.length):
|
|
|
|
|
if not exclude(self[i]):
|
2019-07-11 14:09:22 +03:00
|
|
|
|
counts[get_token_attr(&self.c[i], attr_id)] += 1
|
2015-07-14 04:20:09 +03:00
|
|
|
|
if output_dict:
|
|
|
|
|
return dict(counts)
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
|
|
|
|
def _realloc(self, new_size):
|
2020-06-26 20:34:12 +03:00
|
|
|
|
if new_size < self.max_length:
|
|
|
|
|
return
|
2015-07-13 20:58:26 +03:00
|
|
|
|
self.max_length = new_size
|
|
|
|
|
n = new_size + (PADDING * 2)
|
|
|
|
|
# What we're storing is a "padded" array. We've jumped forward PADDING
|
|
|
|
|
# places, and are storing the pointer to that. This way, we can access
|
|
|
|
|
# words out-of-bounds, and get out-of-bounds markers.
|
|
|
|
|
# Now that we want to realloc, we need the address of the true start,
|
|
|
|
|
# so we jump the pointer back PADDING places.
|
2015-11-03 16:15:14 +03:00
|
|
|
|
cdef TokenC* data_start = self.c - PADDING
|
2015-07-13 20:58:26 +03:00
|
|
|
|
data_start = <TokenC*>self.mem.realloc(data_start, n * sizeof(TokenC))
|
2015-11-03 16:15:14 +03:00
|
|
|
|
self.c = data_start + PADDING
|
2015-07-13 20:58:26 +03:00
|
|
|
|
cdef int i
|
|
|
|
|
for i in range(self.length, self.max_length + PADDING):
|
2015-11-03 16:15:14 +03:00
|
|
|
|
self.c[i].lex = &EMPTY_LEXEME
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
2016-05-05 13:11:57 +03:00
|
|
|
|
def from_array(self, attrs, array):
|
2019-03-08 13:42:26 +03:00
|
|
|
|
"""Load attributes from a numpy array. Write to a `Doc` object, from an
|
|
|
|
|
`(M, N)` array of attributes.
|
|
|
|
|
|
|
|
|
|
attrs (list): A list of attribute ID ints.
|
|
|
|
|
array (numpy.ndarray[ndim=2, dtype='uint64']): The attribute values.
|
|
|
|
|
RETURNS (Doc): Itself.
|
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
|
DOCS: https://spacy.io/api/doc#from_array
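EXAMPLE (an illustrative sketch; assumes `nlp` is a loaded pipeline):
>>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
>>> doc = nlp(text)
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
>>> doc2 = Doc(doc.vocab, words=[t.text for t in doc])
>>> doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)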
|
2019-03-08 13:42:26 +03:00
|
|
|
|
"""
|
2019-03-10 17:50:48 +03:00
|
|
|
|
# Handle scalar/list inputs of strings/ints for py_attr_ids
|
|
|
|
|
# See also #3064
|
2019-12-22 03:53:56 +03:00
|
|
|
|
if isinstance(attrs, str):
|
2019-03-10 17:50:48 +03:00
|
|
|
|
# Handle inputs like doc.to_array('ORTH')
|
|
|
|
|
attrs = [attrs]
|
|
|
|
|
elif not hasattr(attrs, "__iter__"):
|
|
|
|
|
# Handle inputs like doc.to_array(ORTH)
|
|
|
|
|
attrs = [attrs]
|
|
|
|
|
# Allow strings, e.g. 'lemma' or 'LEMMA'
|
|
|
|
|
attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
|
2019-03-10 18:54:03 +03:00
|
|
|
|
for id_ in attrs]
|
2020-03-02 13:49:28 +03:00
|
|
|
|
if array.dtype != numpy.uint64:
|
2020-04-28 14:37:37 +03:00
|
|
|
|
warnings.warn(Warnings.W028.format(type=array.dtype))
|
2019-03-11 16:21:40 +03:00
|
|
|
|
|
2020-06-26 20:34:12 +03:00
|
|
|
|
cdef int i, col
|
|
|
|
|
cdef int32_t abs_head_index
|
2015-07-22 05:53:01 +03:00
|
|
|
|
cdef attr_id_t attr_id
|
2015-11-03 16:15:14 +03:00
|
|
|
|
cdef TokenC* tokens = self.c
|
2015-07-22 05:53:01 +03:00
|
|
|
|
cdef int length = len(array)
|
2020-06-26 20:34:12 +03:00
|
|
|
|
if length != len(self):
|
2020-06-29 15:33:00 +03:00
|
|
|
|
raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))
|
2020-06-26 20:34:12 +03:00
|
|
|
|
|
2017-05-09 19:45:18 +03:00
|
|
|
|
# Get set up for fast loading
|
|
|
|
|
cdef Pool mem = Pool()
|
|
|
|
|
cdef int n_attrs = len(attrs)
|
2019-10-22 17:54:33 +03:00
|
|
|
|
# attrs should not be empty, but make sure to avoid zero-length mem alloc
|
|
|
|
|
assert n_attrs > 0
|
2017-05-09 19:45:18 +03:00
|
|
|
|
attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t))
|
|
|
|
|
for i, attr_id in enumerate(attrs):
|
|
|
|
|
attr_ids[i] = attr_id
|
2019-03-10 18:54:03 +03:00
|
|
|
|
if len(array.shape) == 1:
|
|
|
|
|
array = array.reshape((array.size, 1))
|
2020-06-26 20:34:12 +03:00
|
|
|
|
cdef np.ndarray transposed_array = numpy.ascontiguousarray(array.T)
|
|
|
|
|
values = <const uint64_t*>transposed_array.data
|
|
|
|
|
stride = transposed_array.shape[1]
|
2020-03-03 23:44:51 +03:00
|
|
|
|
# Check that all heads are within the document bounds
|
|
|
|
|
if HEAD in attrs:
|
|
|
|
|
col = attrs.index(HEAD)
|
|
|
|
|
for i in range(length):
|
|
|
|
|
# cast index to signed int
|
2020-06-26 20:34:12 +03:00
|
|
|
|
abs_head_index = <int32_t>values[col * stride + i]
|
|
|
|
|
abs_head_index += i
|
2021-01-12 19:17:06 +03:00
|
|
|
|
if abs_head_index < 0 or abs_head_index >= length:
|
2020-06-26 20:34:12 +03:00
|
|
|
|
raise ValueError(
|
|
|
|
|
Errors.E190.format(
|
|
|
|
|
index=i,
|
|
|
|
|
value=array[i, col],
|
|
|
|
|
rel_head_index=abs_head_index-i
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
# Verify ENT_IOB are proper integers
|
|
|
|
|
if ENT_IOB in attrs:
|
|
|
|
|
iob_strings = Token.iob_strings()
|
|
|
|
|
col = attrs.index(ENT_IOB)
|
|
|
|
|
n_iob_strings = len(iob_strings)
|
2019-03-11 03:31:21 +03:00
|
|
|
|
for i in range(length):
|
2020-06-26 20:34:12 +03:00
|
|
|
|
value = values[col * stride + i]
|
|
|
|
|
if value < 0 or value >= n_iob_strings:
|
|
|
|
|
raise ValueError(
|
|
|
|
|
Errors.E982.format(
|
|
|
|
|
values=iob_strings,
|
|
|
|
|
value=value
|
|
|
|
|
)
|
|
|
|
|
)
|
2017-05-09 19:45:18 +03:00
|
|
|
|
# Now load the data
|
2019-07-23 19:28:55 +03:00
|
|
|
|
for i in range(length):
|
2017-05-09 19:45:18 +03:00
|
|
|
|
token = &self.c[i]
|
|
|
|
|
for j in range(n_attrs):
|
2020-08-07 16:27:13 +03:00
|
|
|
|
value = values[j * stride + i]
|
|
|
|
|
if attr_ids[j] == MORPH:
|
|
|
|
|
# add morph to morphology table
|
|
|
|
|
self.vocab.morphology.add(self.vocab.strings[value])
|
|
|
|
|
Token.set_struct_attr(token, attr_ids[j], value)
|
2020-09-17 01:14:01 +03:00
|
|
|
|
# If document is parsed, set children and sentence boundaries
|
|
|
|
|
if HEAD in attrs and DEP in attrs:
|
|
|
|
|
col = attrs.index(DEP)
|
|
|
|
|
if array[:, col].any():
|
|
|
|
|
set_children_from_heads(self.c, 0, length)
|
2015-07-22 05:53:01 +03:00
|
|
|
|
return self
|
|
|
|
|
|
2020-07-03 12:32:42 +03:00
|
|
|
|
@staticmethod
|
|
|
|
|
def from_docs(docs, ensure_whitespace=True, attrs=None):
|
2020-09-17 01:14:01 +03:00
|
|
|
|
"""Concatenate multiple Doc objects to form a new one. Raises an error
|
|
|
|
|
if the `Doc` objects do not all share the same `Vocab`.
|
2020-07-03 12:32:42 +03:00
|
|
|
|
|
|
|
|
|
docs (list): A list of Doc objects.
|
|
|
|
|
ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace.
|
|
|
|
|
attrs (list): Optional list of attribute ID ints or attribute name strings.
|
|
|
|
|
RETURNS (Doc): A doc that contains the concatenated docs, or None if no docs were given.
|
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
|
DOCS: https://spacy.io/api/doc#from_docs
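EXAMPLE (an illustrative sketch; assumes `nlp` is a loaded pipeline):
>>> doc1 = nlp("Hello world!")
>>> doc2 = nlp("This is another doc.")
>>> merged = Doc.from_docs([doc1, doc2])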
|
2020-07-03 12:32:42 +03:00
|
|
|
|
"""
|
|
|
|
|
if not docs:
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
vocab = {doc.vocab for doc in docs}
|
|
|
|
|
if len(vocab) > 1:
|
|
|
|
|
raise ValueError(Errors.E999)
|
|
|
|
|
(vocab,) = vocab
|
|
|
|
|
|
|
|
|
|
if attrs is None:
|
2021-01-07 08:42:12 +03:00
|
|
|
|
attrs = list(Doc._get_array_attrs())
|
2020-07-03 12:32:42 +03:00
|
|
|
|
else:
|
|
|
|
|
if any(isinstance(attr, str) for attr in attrs): # resolve attribute names
|
|
|
|
|
attrs = [intify_attr(attr) for attr in attrs] # intify_attr returns None for invalid attrs
|
|
|
|
|
attrs = list(attr for attr in set(attrs) if attr) # filter duplicates, remove None if present
|
|
|
|
|
if SPACY not in attrs:
|
|
|
|
|
attrs.append(SPACY)
|
|
|
|
|
|
|
|
|
|
concat_words = []
|
|
|
|
|
concat_spaces = []
|
|
|
|
|
concat_user_data = {}
|
2021-03-29 14:34:01 +03:00
|
|
|
|
concat_spans = defaultdict(list)
|
2020-07-03 12:32:42 +03:00
|
|
|
|
char_offset = 0
|
|
|
|
|
for doc in docs:
|
|
|
|
|
concat_words.extend(t.text for t in doc)
|
|
|
|
|
concat_spaces.extend(bool(t.whitespace_) for t in doc)
|
|
|
|
|
|
|
|
|
|
for key, value in doc.user_data.items():
|
2021-03-30 10:49:12 +03:00
|
|
|
|
if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.":
|
2020-07-03 12:32:42 +03:00
|
|
|
|
data_type, name, start, end = key
|
|
|
|
|
if start is not None or end is not None:
|
|
|
|
|
start += char_offset
|
|
|
|
|
if end is not None:
|
|
|
|
|
end += char_offset
|
|
|
|
|
concat_user_data[(data_type, name, start, end)] = copy.copy(value)
|
|
|
|
|
else:
|
|
|
|
|
warnings.warn(Warnings.W101.format(name=name))
|
|
|
|
|
else:
|
|
|
|
|
warnings.warn(Warnings.W102.format(key=key, value=value))
|
2021-03-29 14:34:01 +03:00
|
|
|
|
for key in doc.spans:
|
2021-06-23 16:51:35 +03:00
|
|
|
|
# if a spans key is in any doc, include it in the merged doc
|
|
|
|
|
# even if it is empty
|
|
|
|
|
if key not in concat_spans:
|
|
|
|
|
concat_spans[key] = []
|
2021-03-29 14:34:01 +03:00
|
|
|
|
for span in doc.spans[key]:
|
|
|
|
|
concat_spans[key].append((
|
|
|
|
|
span.start_char + char_offset,
|
|
|
|
|
span.end_char + char_offset,
|
|
|
|
|
span.label,
|
|
|
|
|
span.kb_id,
|
|
|
|
|
span.text, # included as a check
|
|
|
|
|
))
|
2020-09-03 11:09:03 +03:00
|
|
|
|
char_offset += len(doc.text)
|
2021-06-23 16:51:35 +03:00
|
|
|
|
if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
|
2020-09-03 11:09:03 +03:00
|
|
|
|
char_offset += 1
|
2020-07-03 12:32:42 +03:00
|
|
|
|
|
|
|
|
|
arrays = [doc.to_array(attrs) for doc in docs]
|
|
|
|
|
|
|
|
|
|
if ensure_whitespace:
|
|
|
|
|
spacy_index = attrs.index(SPACY)
|
|
|
|
|
for i, array in enumerate(arrays[:-1]):
|
|
|
|
|
if len(array) > 0 and not docs[i][-1].is_space:
|
|
|
|
|
array[-1][spacy_index] = 1
|
2021-05-05 19:44:14 +03:00
|
|
|
|
if len(concat_spaces) > 0:
|
|
|
|
|
token_offset = -1
|
|
|
|
|
for doc in docs[:-1]:
|
|
|
|
|
token_offset += len(doc)
|
2022-01-18 19:12:42 +03:00
|
|
|
|
if len(doc) > 0 and not doc[-1].is_space:
|
2021-05-05 19:44:14 +03:00
|
|
|
|
concat_spaces[token_offset] = True
|
2020-07-03 12:32:42 +03:00
|
|
|
|
|
|
|
|
|
concat_array = numpy.concatenate(arrays)
|
|
|
|
|
|
|
|
|
|
concat_doc = Doc(vocab, words=concat_words, spaces=concat_spaces, user_data=concat_user_data)
|
|
|
|
|
|
|
|
|
|
concat_doc.from_array(attrs, concat_array)
|
|
|
|
|
|
2021-03-29 14:34:01 +03:00
|
|
|
|
for key in concat_spans:
|
|
|
|
|
if key not in concat_doc.spans:
|
|
|
|
|
concat_doc.spans[key] = []
|
|
|
|
|
for span_tuple in concat_spans[key]:
|
|
|
|
|
span = concat_doc.char_span(
|
|
|
|
|
span_tuple[0],
|
|
|
|
|
span_tuple[1],
|
|
|
|
|
label=span_tuple[2],
|
|
|
|
|
kb_id=span_tuple[3],
|
|
|
|
|
)
|
|
|
|
|
text = span_tuple[4]
|
|
|
|
|
if span is not None and span.text == text:
|
|
|
|
|
concat_doc.spans[key].append(span)
|
|
|
|
|
else:
|
|
|
|
|
raise ValueError(Errors.E873.format(key=key, text=text))
|
|
|
|
|
|
2020-07-03 12:32:42 +03:00
|
|
|
|
return concat_doc
|
|
|
|
|
|
2017-10-20 21:28:00 +03:00
|
|
|
|
def get_lca_matrix(self):
|
2018-12-29 20:02:26 +03:00
|
|
|
|
"""Calculates a matrix of Lowest Common Ancestors (LCA) for a given
|
|
|
|
|
`Doc`, where LCA[i, j] is the index of the lowest common ancestor among
|
|
|
|
|
tokens i and j.
|
|
|
|
|
|
|
|
|
|
RETURNS (np.array[ndim=2, dtype=numpy.int32]): LCA matrix with shape
|
|
|
|
|
(n, n), where n = len(self).
|
2019-03-08 13:42:26 +03:00
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
|
DOCS: https://spacy.io/api/doc#get_lca_matrix
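EXAMPLE (an illustrative sketch; assumes `nlp` is a pipeline with a parser):
>>> doc = nlp("This is a test")
>>> lca_matrix = doc.get_lca_matrix()  # shape (len(doc), len(doc))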
|
2017-10-27 16:41:45 +03:00
|
|
|
|
"""
|
2018-12-29 20:02:26 +03:00
|
|
|
|
return numpy.asarray(_get_lca_matrix(self, 0, len(self)))
|
2017-10-20 21:28:00 +03:00
|
|
|
|
|
2020-06-26 20:34:12 +03:00
|
|
|
|
def copy(self):
|
|
|
|
|
cdef Doc other = Doc(self.vocab)
|
|
|
|
|
other._vector = copy.deepcopy(self._vector)
|
|
|
|
|
other._vector_norm = copy.deepcopy(self._vector_norm)
|
|
|
|
|
other.tensor = copy.deepcopy(self.tensor)
|
|
|
|
|
other.cats = copy.deepcopy(self.cats)
|
|
|
|
|
other.user_data = copy.deepcopy(self.user_data)
|
|
|
|
|
other.sentiment = self.sentiment
|
2020-07-05 00:52:02 +03:00
|
|
|
|
other.has_unknown_spaces = self.has_unknown_spaces
|
2020-06-26 20:34:12 +03:00
|
|
|
|
other.user_hooks = dict(self.user_hooks)
|
|
|
|
|
other.user_token_hooks = dict(self.user_token_hooks)
|
|
|
|
|
other.user_span_hooks = dict(self.user_span_hooks)
|
|
|
|
|
other.length = self.length
|
|
|
|
|
other.max_length = self.max_length
|
2021-02-28 04:32:48 +03:00
|
|
|
|
other.spans = self.spans.copy(doc=other)
|
2020-06-26 20:34:12 +03:00
|
|
|
|
buff_size = other.max_length + (PADDING*2)
|
2021-01-06 04:50:17 +03:00
|
|
|
|
assert buff_size > 0
|
2020-06-26 20:34:12 +03:00
|
|
|
|
tokens = <TokenC*>other.mem.alloc(buff_size, sizeof(TokenC))
|
|
|
|
|
memcpy(tokens, self.c - PADDING, buff_size * sizeof(TokenC))
|
|
|
|
|
other.c = &tokens[PADDING]
|
|
|
|
|
return other
|
|
|
|
|
|
2020-07-29 16:14:07 +03:00
|
|
|
|
def to_disk(self, path, *, exclude=tuple()):
|
2017-05-24 12:58:17 +03:00
|
|
|
|
"""Save the current state to a directory.
|
|
|
|
|
|
2020-05-24 19:51:10 +03:00
|
|
|
|
path (str / Path): A path to a file, which will be created if
|
2017-10-27 18:07:26 +03:00
|
|
|
|
it doesn't exist. Paths may be either strings or Path-like objects.
|
2020-07-29 16:14:07 +03:00
|
|
|
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
2019-03-08 13:42:26 +03:00
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
|
DOCS: https://spacy.io/api/doc#to_disk
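EXAMPLE (an illustrative sketch; the path is hypothetical):
>>> doc.to_disk("/path/to/doc")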
|
2017-05-24 12:58:17 +03:00
|
|
|
|
"""
|
2017-11-09 04:29:03 +03:00
|
|
|
|
path = util.ensure_path(path)
|
2019-03-08 13:42:26 +03:00
|
|
|
|
with path.open("wb") as file_:
|
2020-07-29 16:14:07 +03:00
|
|
|
|
file_.write(self.to_bytes(exclude=exclude))
|
2017-05-24 12:58:17 +03:00
|
|
|
|
|
2020-07-29 16:14:07 +03:00
|
|
|
|
def from_disk(self, path, *, exclude=tuple()):
|
2017-05-24 12:58:17 +03:00
|
|
|
|
"""Loads state from a directory. Modifies the object in place and
|
|
|
|
|
returns it.
|
|
|
|
|
|
2020-05-24 19:51:10 +03:00
|
|
|
|
path (str / Path): A path to a file. Paths may be either
|
2017-05-24 12:58:17 +03:00
|
|
|
|
strings or `Path`-like objects.
|
2019-03-10 21:16:45 +03:00
|
|
|
|
exclude (list): String names of serialization fields to exclude.
|
2017-05-24 12:58:17 +03:00
|
|
|
|
RETURNS (Doc): The modified `Doc` object.
|
2019-03-08 13:42:26 +03:00
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
|
DOCS: https://spacy.io/api/doc#from_disk
|
2017-05-24 12:58:17 +03:00
|
|
|
|
"""
|
2017-11-09 04:29:03 +03:00
|
|
|
|
path = util.ensure_path(path)
|
2019-03-08 13:42:26 +03:00
|
|
|
|
with path.open("rb") as file_:
|
2017-05-31 00:35:17 +03:00
|
|
|
|
bytes_data = file_.read()
|
2020-07-29 16:14:07 +03:00
|
|
|
|
return self.from_bytes(bytes_data, exclude=exclude)
|
2017-05-24 12:58:17 +03:00
|
|
|
|
|
2020-07-29 16:14:07 +03:00
|
|
|
|
def to_bytes(self, *, exclude=tuple()):
|
2017-05-18 23:17:09 +03:00
|
|
|
|
"""Serialize, i.e. export the document contents to a binary string.
|
|
|
|
|
|
2019-03-10 21:16:45 +03:00
|
|
|
|
exclude (list): String names of serialization fields to exclude.
|
2017-05-18 23:17:09 +03:00
|
|
|
|
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
|
|
|
|
|
all annotations.
|
2019-03-08 13:42:26 +03:00
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
|
DOCS: https://spacy.io/api/doc#to_bytes
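EXAMPLE (an illustrative sketch; assumes `nlp` is a loaded pipeline):
>>> doc = nlp("Give it back! He pleaded.")
>>> doc_bytes = doc.to_bytes()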
|
2020-06-26 20:34:12 +03:00
|
|
|
|
"""
|
2020-07-29 16:14:07 +03:00
|
|
|
|
return srsly.msgpack_dumps(self.to_dict(exclude=exclude))
|
2020-06-26 20:34:12 +03:00
|
|
|
|
|
2020-07-29 16:14:07 +03:00
|
|
|
|
def from_bytes(self, bytes_data, *, exclude=tuple()):
|
2020-06-26 20:34:12 +03:00
|
|
|
|
"""Deserialize, i.e. import the document contents from a binary string.
|
|
|
|
|
|
|
|
|
|
bytes_data (bytes): The binary data to load from.
|
|
|
|
|
exclude (list): String names of serialization fields to exclude.
|
|
|
|
|
RETURNS (Doc): Itself.
|
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
|
DOCS: https://spacy.io/api/doc#from_bytes
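EXAMPLE (an illustrative sketch; assumes `doc` was created with the same
vocab that serialized it):
>>> doc_bytes = doc.to_bytes()
>>> doc2 = Doc(doc.vocab).from_bytes(doc_bytes)
>>> assert doc.text == doc2.text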
|
2020-06-26 20:34:12 +03:00
|
|
|
|
"""
|
2020-07-29 16:14:07 +03:00
|
|
|
|
return self.from_dict(srsly.msgpack_loads(bytes_data), exclude=exclude)
|
2020-06-26 20:34:12 +03:00
|
|
|
|
|
2020-07-29 16:14:07 +03:00
|
|
|
|
def to_dict(self, *, exclude=tuple()):
|
2020-06-26 20:34:12 +03:00
|
|
|
|
"""Export the document contents to a dictionary for serialization.
|
|
|
|
|
|
2019-03-10 21:16:45 +03:00
|
|
|
|
exclude (list): String names of serialization fields to exclude.
|
2017-05-18 23:17:09 +03:00
|
|
|
|
RETURNS (dict): A dictionary representation of the `Doc`, including
|
|
|
|
|
all annotations.
|
2019-03-08 13:42:26 +03:00
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
|
DOCS: https://spacy.io/api/doc#to_bytes
|
2017-04-15 14:05:15 +03:00
|
|
|
|
"""
|
2020-09-17 01:14:01 +03:00
|
|
|
|
array_head = Doc._get_array_attrs()
|
2020-07-02 18:11:57 +03:00
|
|
|
|
strings = set()
|
|
|
|
|
for token in self:
|
|
|
|
|
strings.add(token.tag_)
|
|
|
|
|
strings.add(token.lemma_)
|
2020-10-01 23:21:46 +03:00
|
|
|
|
strings.add(str(token.morph))
|
2020-07-02 18:11:57 +03:00
|
|
|
|
strings.add(token.dep_)
|
|
|
|
|
strings.add(token.ent_type_)
|
|
|
|
|
strings.add(token.ent_kb_id_)
|
2020-09-17 01:14:01 +03:00
|
|
|
|
strings.add(token.ent_id_)
|
2020-07-02 18:11:57 +03:00
|
|
|
|
strings.add(token.norm_)
|
2021-01-14 09:30:41 +03:00
|
|
|
|
for group in self.spans.values():
|
|
|
|
|
for span in group:
|
|
|
|
|
strings.add(span.label_)
|
2017-10-17 20:29:20 +03:00
|
|
|
|
# Msgpack doesn't distinguish between lists and tuples, which is
|
|
|
|
|
# vexing for user data. As a best guess, we *know* that within
|
|
|
|
|
# keys, we must have tuples. In values we just have to hope
|
|
|
|
|
# users don't mind getting a list instead of a tuple.
|
2017-05-31 00:35:17 +03:00
|
|
|
|
serializers = {
|
2019-03-08 13:42:26 +03:00
|
|
|
|
"text": lambda: self.text,
|
|
|
|
|
"array_head": lambda: array_head,
|
|
|
|
|
"array_body": lambda: self.to_array(array_head),
|
|
|
|
|
"sentiment": lambda: self.sentiment,
|
|
|
|
|
"tensor": lambda: self.tensor,
|
2019-12-06 16:07:39 +03:00
|
|
|
|
"cats": lambda: self.cats,
|
2021-01-14 09:30:41 +03:00
|
|
|
|
"spans": lambda: self.spans.to_bytes(),
|
2020-07-02 18:11:57 +03:00
|
|
|
|
"strings": lambda: list(strings),
|
2020-07-03 13:58:16 +03:00
|
|
|
|
"has_unknown_spaces": lambda: self.has_unknown_spaces
|
2017-05-31 00:35:17 +03:00
|
|
|
|
}
|
2019-03-08 13:42:26 +03:00
|
|
|
|
if "user_data" not in exclude and self.user_data:
|
2017-10-17 20:29:20 +03:00
|
|
|
|
user_data_keys, user_data_values = list(zip(*self.user_data.items()))
|
2019-03-10 21:16:45 +03:00
|
|
|
|
if "user_data_keys" not in exclude:
|
|
|
|
|
serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys)
|
|
|
|
|
if "user_data_values" not in exclude:
|
|
|
|
|
serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
|
2020-12-29 13:54:32 +03:00
|
|
|
|
if "user_hooks" not in exclude and any((self.user_hooks, self.user_token_hooks, self.user_span_hooks)):
|
2021-06-04 18:44:04 +03:00
|
|
|
|
warnings.warn(Warnings.W109)
|
2020-06-26 20:34:12 +03:00
|
|
|
|
return util.to_dict(serializers, exclude)
|
2017-05-31 00:35:17 +03:00
|
|
|
|
|
2020-07-29 16:14:07 +03:00
|
|
|
|
def from_dict(self, msg, *, exclude=tuple()):
|
2017-05-18 23:17:09 +03:00
|
|
|
|
"""Deserialize, i.e. import the document contents from a binary string.
|
|
|
|
|
|
|
|
|
|
msg (dict): The dictionary to load from.
|
2019-03-10 21:16:45 +03:00
|
|
|
|
exclude (list): String names of serialization fields to exclude.
|
2017-05-18 23:17:09 +03:00
|
|
|
|
RETURNS (Doc): Itself.
|
2019-03-08 13:42:26 +03:00
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
|
DOCS: https://spacy.io/api/doc#from_dict
|
2017-04-15 14:05:15 +03:00
|
|
|
|
"""
|
2017-05-09 19:11:34 +03:00
|
|
|
|
if self.length != 0:
|
2018-04-03 16:50:31 +03:00
|
|
|
|
raise ValueError(Errors.E033.format(length=self.length))
|
2017-10-17 20:29:20 +03:00
|
|
|
|
# Msgpack doesn't distinguish between lists and tuples, which is
|
|
|
|
|
# vexing for user data. As a best guess, we *know* that within
|
|
|
|
|
# keys, we must have tuples. In values we just have to hope
|
|
|
|
|
# users don't mind getting a list instead of a tuple.
|
2019-03-08 13:42:26 +03:00
|
|
|
|
if "user_data" not in exclude and "user_data_keys" in msg:
|
|
|
|
|
user_data_keys = srsly.msgpack_loads(msg["user_data_keys"], use_list=False)
|
|
|
|
|
user_data_values = srsly.msgpack_loads(msg["user_data_values"])
|
2017-10-17 20:29:20 +03:00
|
|
|
|
for key, value in zip(user_data_keys, user_data_values):
|
|
|
|
|
self.user_data[key] = value
|
2017-05-09 19:11:34 +03:00
|
|
|
|
cdef int i, start, end, has_space
|
2019-03-08 13:42:26 +03:00
|
|
|
|
if "sentiment" not in exclude and "sentiment" in msg:
|
|
|
|
|
self.sentiment = msg["sentiment"]
|
|
|
|
|
if "tensor" not in exclude and "tensor" in msg:
|
|
|
|
|
self.tensor = msg["tensor"]
|
2019-12-06 16:07:39 +03:00
|
|
|
|
if "cats" not in exclude and "cats" in msg:
|
|
|
|
|
self.cats = msg["cats"]
|
2020-07-02 18:11:57 +03:00
|
|
|
|
if "strings" not in exclude and "strings" in msg:
|
|
|
|
|
for s in msg["strings"]:
|
|
|
|
|
self.vocab.strings.add(s)
|
2020-07-03 13:58:16 +03:00
|
|
|
|
if "has_unknown_spaces" not in exclude and "has_unknown_spaces" in msg:
|
|
|
|
|
self.has_unknown_spaces = msg["has_unknown_spaces"]
|
2017-05-09 19:11:34 +03:00
|
|
|
|
start = 0
|
|
|
|
|
cdef const LexemeC* lex
|
2021-09-13 18:02:17 +03:00
|
|
|
|
cdef str orth_
|
2019-03-08 13:42:26 +03:00
|
|
|
|
text = msg["text"]
|
|
|
|
|
attrs = msg["array_body"]
|
2017-05-09 19:11:34 +03:00
|
|
|
|
for i in range(attrs.shape[0]):
|
|
|
|
|
end = start + attrs[i, 0]
|
|
|
|
|
has_space = attrs[i, 1]
|
|
|
|
|
orth_ = text[start:end]
|
|
|
|
|
lex = self.vocab.get(self.mem, orth_)
|
|
|
|
|
self.push_back(lex, has_space)
|
|
|
|
|
start = end + has_space
|
2019-03-08 13:42:26 +03:00
|
|
|
|
self.from_array(msg["array_head"][2:], attrs[:, 2:])
|
2021-01-14 09:30:41 +03:00
|
|
|
|
if "spans" in msg:
|
|
|
|
|
self.spans.from_bytes(msg["spans"])
|
|
|
|
|
else:
|
|
|
|
|
self.spans.clear()
|
2017-05-09 19:45:18 +03:00
|
|
|
|
return self
|
2015-07-22 05:53:01 +03:00
|
|
|
|
|
2017-11-03 13:20:31 +03:00
|
|
|
|
def extend_tensor(self, tensor):
|
2019-03-08 13:42:26 +03:00
|
|
|
|
"""Concatenate a new tensor onto the doc.tensor object.
|
2017-11-03 13:20:31 +03:00
|
|
|
|
|
|
|
|
|
The doc.tensor attribute holds dense feature vectors
|
|
|
|
|
computed by the models in the pipeline. Let's say a
|
|
|
|
|
document with 30 words has a tensor with 128 dimensions
|
|
|
|
|
per word. doc.tensor.shape will be (30, 128). After
|
2018-04-19 00:55:26 +03:00
|
|
|
|
calling doc.extend_tensor with an array of shape (30, 64),
|
2017-11-03 13:20:31 +03:00
|
|
|
|
doc.tensor.shape will be (30, 192).
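EXAMPLE (an illustrative sketch; assumes `nlp` is a loaded pipeline):
>>> import numpy
>>> doc = nlp("Give it back!")
>>> # Append 64 extra feature columns per token to doc.tensor
>>> doc.extend_tensor(numpy.zeros((len(doc), 64), dtype="float32"))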
|
2019-03-08 13:42:26 +03:00
|
|
|
|
"""
|
2017-11-03 13:20:31 +03:00
|
|
|
|
xp = get_array_module(self.tensor)
|
|
|
|
|
if self.tensor.size == 0:
|
2018-11-29 18:30:29 +03:00
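            # Note on the two branches below: the first path resizes the existing
            # tensor buffer in place and copies the new values into it; the else
            # path concatenates the new tensor column-wise onto the current one
            # via xp.hstack.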
            self.tensor.resize(tensor.shape, refcheck=False)
            copy_array(self.tensor, tensor)
        else:
            self.tensor = xp.hstack((self.tensor, tensor))

    def retokenize(self):
        """Context manager to handle retokenization of the Doc.
        Modifications to the Doc's tokenization are stored, and then
        made all at once when the context manager exits. This is
        much more efficient, and less error-prone.

        All views of the Doc (Span and Token) created before the
        retokenization are invalidated, although they may accidentally
        continue to work.

        DOCS: https://spacy.io/api/doc#retokenize
        USAGE: https://spacy.io/usage/linguistic-features#retokenization
        """
        return Retokenizer(self)
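
    # Usage sketch (illustrative only; `doc` and `matches` are hypothetical
    # names, and the pattern follows the retokenization usage docs linked in
    # the docstring above):
    #
    #     with doc.retokenize() as retokenizer:
    #         for start, end, label in matches:
    #             retokenizer.merge(doc[start:end], attrs={"ENT_TYPE": label})
    #
    # Merges are accumulated and applied together when the context manager
    # exits, which is why pre-existing Span and Token views may be invalidated.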

    def _bulk_merge(self, spans, attributes):
        """Retokenize the document, such that the spans given as arguments
        are merged into single tokens. The spans need to be in document
        order, and no span intersection is allowed.

        spans (Span[]): Spans to merge, in document order, with all span
            intersections empty. Cannot be empty.
        attributes (Dictionary[]): Attributes to assign to the merged tokens.
            Must be the same length as spans; empty dictionaries are allowed.
            By default, attributes are inherited from the syntactic root of
            the span.
        RETURNS (Token): The first newly merged token.
        """
        cdef str tag, lemma, ent_type
        attr_len = len(attributes)
        span_len = len(spans)
        if attr_len != span_len:
            raise ValueError(Errors.E121.format(attr_len=attr_len, span_len=span_len))
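        # Each span must be paired with exactly one attrs dict (an empty dict
        # simply falls back to inheriting from the span's syntactic root, per
        # the docstring above), hence the strict length check.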
        with self.retokenize() as retokenizer:
            for i, span in enumerate(spans):
                fix_attributes(self, attributes[i])
                remove_label_if_necessary(attributes[i])
                retokenizer.merge(span, attributes[i])
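
    # Illustrative sketch (not part of the module): the same bulk merge, done
    # through the public retokenizer API. The model name and example text are
    # assumptions made for the sake of the example.
    #
    #     import spacy
    #     nlp = spacy.load("en_core_web_sm")
    #     doc = nlp("The New York Times wrote about New York City.")
    #     with doc.retokenize() as retokenizer:
    #         retokenizer.merge(doc[1:4], attrs={"LEMMA": "New York Times"})
    #         retokenizer.merge(doc[6:9])
    #     # All queued merges are applied together when the context manager exits.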

    def to_json(self, underscore=None):
        """Convert a Doc to JSON.

        underscore (list): Optional list of string names of custom doc._.
            attributes. Attribute values need to be JSON-serializable. Values will
            be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
        RETURNS (dict): The data in JSON format.
        """
        data = {"text": self.text}
        if self.has_annotation("ENT_IOB"):
            data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
                             "label": ent.label_} for ent in self.ents]
        if self.has_annotation("SENT_START"):
            sents = list(self.sents)
            data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
                             for sent in sents]
        if self.cats:
            data["cats"] = self.cats
        data["tokens"] = []
        attrs = ["TAG", "MORPH", "POS", "LEMMA", "DEP"]
        include_annotation = {attr: self.has_annotation(attr) for attr in attrs}
        for token in self:
            token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
            if include_annotation["TAG"]:
                token_data["tag"] = token.tag_
            if include_annotation["POS"]:
                token_data["pos"] = token.pos_
            if include_annotation["MORPH"]:
                token_data["morph"] = token.morph.to_json()
            if include_annotation["LEMMA"]:
                token_data["lemma"] = token.lemma_
            if include_annotation["DEP"]:
                token_data["dep"] = token.dep_
                token_data["head"] = token.head.i
            data["tokens"].append(token_data)

        if self.spans:
            data["spans"] = {}
            for span_group in self.spans:
                data["spans"][span_group] = []
                for span in self.spans[span_group]:
                    span_data = {"start": span.start_char, "end": span.end_char, "label": span.label_, "kb_id": span.kb_id_}
                    data["spans"][span_group].append(span_data)

        if underscore:
            data["_"] = {}
            for attr in underscore:
                if not self.has_extension(attr):
                    raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
                value = self._.get(attr)
                if not srsly.is_json_serializable(value):
                    raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
                data["_"][attr] = value
        return data
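
    # Illustrative sketch (not part of the module): typical use of `to_json`.
    # The model name and text are assumptions; any pipeline that sets the
    # relevant annotations will do.
    #
    #     import spacy
    #     nlp = spacy.load("en_core_web_sm")
    #     doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
    #     data = doc.to_json()
    #     assert data["text"] == doc.text
    #     # Keys like "ents", "sents" and per-token "tag"/"pos"/"dep" fields are
    #     # only present when the corresponding annotation is set on the Doc.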

    def to_utf8_array(self, int nr_char=-1):
        """Encode word strings to utf8, and export to a fixed-width array
        of characters. Characters are placed into the array in the order:
        0, -1, 1, -2, etc
        For example, if the array is sliced array[:, :8], the array will
        contain the first 4 characters and last 4 characters of each word ---
        with the middle characters clipped out. The value 255 is used as a pad
        value.
        """
        byte_strings = [token.orth_.encode('utf8') for token in self]
        if nr_char == -1:
            nr_char = max(len(bs) for bs in byte_strings)
        cdef np.ndarray output = numpy.zeros((len(byte_strings), nr_char), dtype='uint8')
        output.fill(255)
        cdef int i, j, start_idx, end_idx
        cdef bytes byte_string
        cdef unsigned char utf8_char
        for i, byte_string in enumerate(byte_strings):
            j = 0
            start_idx = 0
            end_idx = len(byte_string) - 1
            while j < nr_char and start_idx <= end_idx:
                output[i, j] = <unsigned char>byte_string[start_idx]
                start_idx += 1
                j += 1
                if j < nr_char and start_idx <= end_idx:
                    output[i, j] = <unsigned char>byte_string[end_idx]
                    end_idx -= 1
                    j += 1
        return output
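
    # Illustrative sketch (not part of the module): the clipping behaviour
    # described in the docstring above. `doc` is assumed to be any processed Doc.
    #
    #     arr = doc.to_utf8_array(nr_char=8)
    #     # Row i alternates bytes from the front and the back of token i
    #     # (positions 0, -1, 1, -2, ...), so an 8-wide row keeps the first four
    #     # and last four UTF-8 bytes, with 255 used as padding for short tokens.
    #     assert arr.shape == (len(doc), 8)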

    @staticmethod
    def _get_array_attrs():
        attrs = [LENGTH, SPACY]
        attrs.extend(intify_attr(x) for x in DOCBIN_ALL_ATTRS)
        return tuple(attrs)


cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
    cdef int i = token_by_char(tokens, length, start_char)
    if i >= 0 and tokens[i].idx == start_char:
        return i
    else:
        return -1


cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2:
    # end_char is exclusive, so find the token at one char before
    cdef int i = token_by_char(tokens, length, end_char - 1)
    if i >= 0 and tokens[i].idx + tokens[i].lex.length == end_char:
        return i
    else:
        return -1


cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2:
    cdef int start = 0, mid, end = length - 1
    while start <= end:
        mid = (start + end) / 2
        if char_idx < tokens[mid].idx:
            end = mid - 1
        elif char_idx >= tokens[mid].idx + tokens[mid].lex.length + tokens[mid].spacy:
            start = mid + 1
        else:
            return mid
    return -1
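
# Illustrative sketch (not part of the module): the same character-to-token
# binary search in plain Python over (idx, length, spacy) triples, where
# `spacy` is 1 if the token is followed by a single trailing space. Useful for
# reasoning about the boundary conditions above.
#
#     def py_token_by_char(tokens, char_idx):
#         start, end = 0, len(tokens) - 1
#         while start <= end:
#             mid = (start + end) // 2
#             idx, length, trailing_space = tokens[mid]
#             if char_idx < idx:
#                 end = mid - 1
#             elif char_idx >= idx + length + trailing_space:
#                 start = mid + 1
#             else:
#                 return mid
#         return -1
#
#     # "Hi there" -> [(0, 2, 1), (3, 5, 0)]; char 4 falls inside "there".
#     assert py_token_by_char([(0, 2, 1), (3, 5, 0)], 4) == 1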


cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
    # note: end is exclusive
    cdef TokenC* head
    cdef TokenC* child
    cdef int i
    # Set number of left/right children to 0. We'll increment it in the loops.
    for i in range(start, end):
        tokens[i].l_kids = 0
        tokens[i].r_kids = 0
        tokens[i].l_edge = i
        tokens[i].r_edge = i
    cdef int loop_count = 0
    cdef bint heads_within_sents = False
    # Try up to 10 iterations of adjusting lr_kids and lr_edges in order to
    # handle non-projective dependency parses, stopping when all heads are
    # within their respective sentence boundaries. We have documented cases
    # that need at least 4 iterations, so this is to be on the safe side
    # without risking getting stuck in an infinite loop if something is
    # terribly malformed.
    while not heads_within_sents:
        heads_within_sents = _set_lr_kids_and_edges(tokens, start, end, loop_count)
        if loop_count > 10:
            util.logger.debug(Warnings.W026)
            break
        loop_count += 1
    # Set sentence starts
    for i in range(start, end):
        tokens[i].sent_start = -1
    for i in range(start, end):
        if tokens[i].head == 0 and not Token.missing_head(&tokens[i]):
            tokens[tokens[i].l_edge].sent_start = 1
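
# Illustrative sketch (not part of the module): the edge propagation above in
# plain Python for a single projective parse, where `heads` holds relative
# offsets and a 0 offset marks a root. The real loop repeats its passes to
# handle non-projective parses; a single forward and backward pass is enough
# for the projective case shown here.
#
#     def py_edges(heads):
#         n = len(heads)
#         l_edge = list(range(n))
#         r_edge = list(range(n))
#         for i in range(n):                      # left-to-right pass
#             h = i + heads[i]
#             if i < h:
#                 l_edge[h] = min(l_edge[h], l_edge[i])
#                 r_edge[h] = max(r_edge[h], r_edge[i])
#         for i in range(n - 1, -1, -1):          # right-to-left pass
#             h = i + heads[i]
#             if i > h:
#                 r_edge[h] = max(r_edge[h], r_edge[i])
#                 l_edge[h] = min(l_edge[h], l_edge[i])
#         return l_edge, r_edge
#
#     # heads for "I like cats": [+1, 0, -1] -> the root "like" spans the sentence.
#     assert py_edges([1, 0, -1]) == ([0, 0, 2], [0, 2, 2])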


cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1:
    # May be called multiple times due to non-projectivity. See issues #3170
    # and #4688.
    # Set left edges
    cdef TokenC* head
    cdef TokenC* child
    cdef int i, j
    for i in range(start, end):
        child = &tokens[i]
        head = &tokens[i + child.head]
        if loop_count == 0 and child < head:
            head.l_kids += 1
        if child.l_edge < head.l_edge:
            head.l_edge = child.l_edge
        if child.r_edge > head.r_edge:
            head.r_edge = child.r_edge
    # Set right edges - same as above, but iterate in reverse
    for i in range(end-1, start-1, -1):
        child = &tokens[i]
        head = &tokens[i + child.head]
        if loop_count == 0 and child > head:
            head.r_kids += 1
        if child.r_edge > head.r_edge:
            head.r_edge = child.r_edge
        if child.l_edge < head.l_edge:
            head.l_edge = child.l_edge
    # Get sentence start positions according to current state
    sent_starts = set()
    for i in range(start, end):
        if tokens[i].head == 0:
            sent_starts.add(tokens[i].l_edge)
    cdef int curr_sent_start = 0
    cdef int curr_sent_end = 0
    # Check whether any heads are not within the current sentence
    for i in range(start, end):
        if (i > 0 and i in sent_starts) or i == end - 1:
            curr_sent_end = i
            for j in range(curr_sent_start, curr_sent_end):
                if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1:
                    return False
            curr_sent_start = i
    return True


cdef int _get_tokens_lca(Token token_j, Token token_k):
    """Given two tokens, returns the index of the lowest common ancestor
    (LCA) among the two. If they have no common ancestor, -1 is returned.

    token_j (Token): a token.
    token_k (Token): another token.
    RETURNS (int): index of lowest common ancestor, or -1 if the tokens
        have no common ancestor.
    """
    if token_j == token_k:
        return token_j.i
    elif token_j.head == token_k:
        return token_k.i
    elif token_k.head == token_j:
        return token_j.i
    token_j_ancestors = set(token_j.ancestors)
    if token_k in token_j_ancestors:
        return token_k.i
    for token_k_ancestor in token_k.ancestors:
        if token_k_ancestor == token_j:
            return token_j.i
        if token_k_ancestor in token_j_ancestors:
            return token_k_ancestor.i
    return -1


cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
    """Given a doc and a start and end position defining a set of contiguous
    tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where
    LCA[i, j] is the index of the lowest common ancestor among token i and j.
    If the tokens have no common ancestor within the specified span,
    LCA[i, j] will be -1.

    doc (Doc): The document to build the LCA matrix for.
    start (int): First token to be included in the LCA matrix.
    end (int): Position after the last token included in the LCA matrix
        (exclusive).
    RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32],
        with shape (n, n), where n = end - start.
    """
    cdef int [:,:] lca_matrix
    cdef int j, k
    n_tokens = end - start
    lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
    lca_mat.fill(-1)
    lca_matrix = lca_mat
    for j in range(n_tokens):
        token_j = doc[start + j]
        # the common ancestor of token and itself is itself:
        lca_matrix[j, j] = j
        # we will only iterate through tokens in the same sentence
        sent = token_j.sent
        sent_start = sent.start
        j_idx_in_sent = start + j - sent_start
        n_missing_tokens_in_sent = len(sent) - j_idx_in_sent
        # make sure we do not go past `end`, in cases where `end` < sent.end
        max_range = min(j + n_missing_tokens_in_sent, end - start)
        for k in range(j + 1, max_range):
            lca = _get_tokens_lca(token_j, doc[start + k])
            # if lca is outside of span, we set it to -1
            if not start <= lca < end:
                lca_matrix[j, k] = -1
                lca_matrix[k, j] = -1
            else:
                lca_matrix[j, k] = lca - start
                lca_matrix[k, j] = lca - start
    return lca_matrix
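
# Illustrative sketch (not part of the module): this helper is what the public
# `Doc.get_lca_matrix` and `Span.get_lca_matrix` methods use. Model name and
# text are assumptions.
#
#     import spacy
#     nlp = spacy.load("en_core_web_sm")
#     doc = nlp("I like New York")
#     lca = doc.get_lca_matrix()
#     # lca[i, j] is the index of the lowest common ancestor of tokens i and j,
#     # or -1 if they have no common ancestor within the doc.
#     assert lca[0, 0] == 0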


def pickle_doc(doc):
    bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"])
    hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,
                      doc.user_token_hooks, doc._context)
    return (unpickle_doc, (doc.vocab, srsly.pickle_dumps(hooks_and_data), bytes_data))


def unpickle_doc(vocab, hooks_and_data, bytes_data):
    user_data, doc_hooks, span_hooks, token_hooks, _context = srsly.pickle_loads(hooks_and_data)
    doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude=["user_data"])
    doc.user_hooks.update(doc_hooks)
    doc.user_span_hooks.update(span_hooks)
    doc.user_token_hooks.update(token_hooks)
    doc._context = _context
    return doc


copy_reg.pickle(Doc, pickle_doc, unpickle_doc)
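
# Illustrative sketch (not part of the module): the registration above makes
# Doc objects picklable with the standard library. Model name and text are
# assumptions.
#
#     import pickle
#     import spacy
#     nlp = spacy.load("en_core_web_sm")
#     doc = nlp("Hello, world!")
#     doc2 = pickle.loads(pickle.dumps(doc))
#     assert doc2.text == doc.text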
|
💫 Port master changes over to develop (#2979)
* Create aryaprabhudesai.md (#2681)
* Update _install.jade (#2688)
Typo fix: "models" -> "model"
* Add FAC to spacy.explain (resolves #2706)
* Remove docstrings for deprecated arguments (see #2703)
* When calling getoption() in conftest.py, pass a default option (#2709)
* When calling getoption() in conftest.py, pass a default option
This is necessary to allow testing an installed spacy by running:
pytest --pyargs spacy
* Add contributor agreement
* update bengali token rules for hyphen and digits (#2731)
* Less norm computations in token similarity (#2730)
* Less norm computations in token similarity
* Contributor agreement
* Remove ')' for clarity (#2737)
Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.
* added contributor agreement for mbkupfer (#2738)
* Basic support for Telugu language (#2751)
* Lex _attrs for polish language (#2750)
* Signed spaCy contributor agreement
* Added polish version of english lex_attrs
* Introduces a bulk merge function, in order to solve issue #653 (#2696)
* Fix comment
* Introduce bulk merge to increase performance on many span merges
* Sign contributor agreement
* Implement pull request suggestions
* Describe converters more explicitly (see #2643)
* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]
* Fix formatting
* Fix dependency scheme docs (closes #2705) [ci skip]
* Don't set stop word in example (closes #2657) [ci skip]
* Add words to portuguese language _num_words (#2759)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Update Indonesian model (#2752)
* adding e-KTP in tokenizer exceptions list
* add exception token
* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception
* add tokenizer exceptions list
* combining base_norms with norm_exceptions
* adding norm_exception
* fix double key in lemmatizer
* remove unused import on punctuation.py
* reformat stop_words to reduce number of lines, improve readibility
* updating tokenizer exception
* implement is_currency for lang/id
* adding orth_first_upper in tokenizer_exceptions
* update the norm_exception list
* remove bunch of abbreviations
* adding contributors file
* Fixed spaCy+Keras example (#2763)
* bug fixes in keras example
* created contributor agreement
* Adding French hyphenated first name (#2786)
* Fix typo (closes #2784)
* Fix typo (#2795) [ci skip]
Fixed typo on line 6 "regcognizer --> recognizer"
* Adding basic support for Sinhala language. (#2788)
* adding Sinhala language package, stop words, examples and lex_attrs.
* Adding contributor agreement
* Updating contributor agreement
* Also include lowercase norm exceptions
* Fix error (#2802)
* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way. Use the resize function
* added spaCy Contributor Agreement
* Add charlax's contributor agreement (#2805)
* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)
* Contributors agreement
* Contributors agreement
* Contributors agreement
* Add jupyter=True to displacy.render in documentation (#2806)
* Revert "Also include lowercase norm exceptions"
This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.
* Remove deprecated encoding argument to msgpack
* Set up dependency tree pattern matching skeleton (#2732)
* Fix bug when too many entity types. Fixes #2800
* Fix Python 2 test failure
* Require older msgpack-numpy
* Restore encoding arg on msgpack-numpy
* Try to fix version pin for msgpack-numpy
* Update Portuguese Language (#2790)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols
* Extended punctuation and norm_exceptions in the Portuguese language
* Correct error in spacy universe docs concerning spacy-lookup (#2814)
* Update Keras Example for (Parikh et al, 2016) implementation (#2803)
* bug fixes in keras example
* created contributor agreement
* baseline for Parikh model
* initial version of parikh 2016 implemented
* tested asymmetric models
* fixed grevious error in normalization
* use standard SNLI test file
* begin to rework parikh example
* initial version of running example
* start to document the new version
* start to document the new version
* Update Decompositional Attention.ipynb
* fixed calls to similarity
* updated the README
* import sys package duh
* simplified indexing on mapping word to IDs
* stupid python indent error
* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround
* Fix typo (closes #2815) [ci skip]
* Update regex version dependency
* Set version to 2.0.13.dev3
* Skip seemingly problematic test
* Remove problematic test
* Try previous version of regex
* Revert "Remove problematic test"
This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.
* Unskip test
* Try older version of regex
* 💫 Update training examples and use minibatching (#2830)
<!--- Provide a general summary of your changes in the title. -->
## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.
### Types of change
enhancements
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Visual C++ link updated (#2842) (closes #2841) [ci skip]
* New landing page
* Add contribution agreement
* Correcting lang/ru/examples.py (#2845)
* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement
* Correct some grammatical inaccuracies in lang\ru\examples.py
* Move contributor agreement to separate file
* Set version to 2.0.13.dev4
* Add Persian(Farsi) language support (#2797)
* Also include lowercase norm exceptions
* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors
* Rule-based French Lemmatizer (#2818)
<!--- Provide a general summary of your changes in the title. -->
## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->
Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Set version to 2.0.13
* Fix formatting and consistency
* Update docs for new version [ci skip]
* Increment version [ci skip]
* Add info on wheels [ci skip]
* Adding "This is a sentence" example to Sinhala (#2846)
* Add wheels badge
* Update badge [ci skip]
* Update README.rst [ci skip]
* Update murmurhash pin
* Increment version to 2.0.14.dev0
* Update GPU docs for v2.0.14
* Add wheel to setup_requires
* Import prefer_gpu and require_gpu functions from Thinc
* Add tests for prefer_gpu() and require_gpu()
* Update requirements and setup.py
* Workaround bug in thinc require_gpu
* Set version to v2.0.14
* Update push-tag script
* Unhack prefer_gpu
* Require thinc 6.10.6
* Update prefer_gpu and require_gpu docs [ci skip]
* Fix specifiers for GPU
* Set version to 2.0.14.dev1
* Set version to 2.0.14
* Update Thinc version pin
* Increment version
* Fix msgpack-numpy version pin
* Increment version
* Update version to 2.0.16
* Update version [ci skip]
* Redundant ')' in the Stop words' example (#2856)
<!--- Provide a general summary of your changes in the title. -->
## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Documentation improvement regarding joblib and SO (#2867)
Some documentation improvements
## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)
### Types of change
Documentation
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* raise error when setting overlapping entities as doc.ents (#2880)
* Fix out-of-bounds access in NER training
The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!
This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.
* Change PyThaiNLP Url (#2876)
* Fix missing comma
* Add example showing a fix-up rule for space entities
* Set version to 2.0.17.dev0
* Update regex version
* Revert "Update regex version"
This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.
* Try setting older regex version, to align with conda
* Set version to 2.0.17
* Add spacy-js to universe [ci-skip]
* Add spacy-raspberry to universe (closes #2889)
* Add script to validate universe json [ci skip]
* Removed space in docs + added contributor indo (#2909)
* - removed unneeded space in documentation
* - added contributor info
* Allow input text of length up to max_length, inclusive (#2922)
* Include universe spec for spacy-wordnet component (#2919)
* feat: include universe spec for spacy-wordnet component
* chore: include spaCy contributor agreement
* Minor formatting changes [ci skip]
* Fix image [ci skip]
Twitter URL doesn't work on live site
* Check if the word is in one of the regular lists specific to each POS (#2886)
* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)
Resolves #2924.
## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)
### Types of change
bug fix
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix typo [ci skip]
* fixes symbolic link on py3 and windows (#2949)
* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948
* Update spacy/compat.py
Co-Authored-By: cicorias <cicorias@users.noreply.github.com>
* Fix formatting
* Update universe [ci skip]
* Catalan Language Support (#2940)
* Catalan language Support
* Ddding Catalan to documentation
* Sort languages alphabetically [ci skip]
* Update tests for pytest 4.x (#2965)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix regex pin to harmonize with conda (#2964)
* Update README.rst
* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)
Fixes #2976
* Fix typo
* Fix typo
* Remove duplicate file
* Require thinc 7.0.0.dev2
Fixes bug in gpu_ops that would use cupy instead of numpy on CPU
* Add missing import
* Fix error IDs
* Fix tests
2018-11-29 18:30:29 +03:00
|
|
|
|
|
2019-03-08 13:42:26 +03:00
|
|
|
|
|
💫 Port master changes over to develop (#2979)
* Create aryaprabhudesai.md (#2681)
* Update _install.jade (#2688)
Typo fix: "models" -> "model"
* Add FAC to spacy.explain (resolves #2706)
* Remove docstrings for deprecated arguments (see #2703)
* When calling getoption() in conftest.py, pass a default option (#2709)
* When calling getoption() in conftest.py, pass a default option
This is necessary to allow testing an installed spacy by running:
pytest --pyargs spacy
* Add contributor agreement
* update bengali token rules for hyphen and digits (#2731)
* Less norm computations in token similarity (#2730)
* Less norm computations in token similarity
* Contributor agreement
* Remove ')' for clarity (#2737)
Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.
* added contributor agreement for mbkupfer (#2738)
* Basic support for Telugu language (#2751)
* Lex _attrs for polish language (#2750)
* Signed spaCy contributor agreement
* Added polish version of english lex_attrs
* Introduces a bulk merge function, in order to solve issue #653 (#2696)
* Fix comment
* Introduce bulk merge to increase performance on many span merges
* Sign contributor agreement
* Implement pull request suggestions
* Describe converters more explicitly (see #2643)
* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]
* Fix formatting
* Fix dependency scheme docs (closes #2705) [ci skip]
* Don't set stop word in example (closes #2657) [ci skip]
* Add words to portuguese language _num_words (#2759)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Update Indonesian model (#2752)
* adding e-KTP in tokenizer exceptions list
* add exception token
* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception
* add tokenizer exceptions list
* combining base_norms with norm_exceptions
* adding norm_exception
* fix double key in lemmatizer
* remove unused import on punctuation.py
* reformat stop_words to reduce number of lines, improve readibility
* updating tokenizer exception
* implement is_currency for lang/id
* adding orth_first_upper in tokenizer_exceptions
* update the norm_exception list
* remove bunch of abbreviations
* adding contributors file
* Fixed spaCy+Keras example (#2763)
* bug fixes in keras example
* created contributor agreement
* Adding French hyphenated first name (#2786)
* Fix typo (closes #2784)
* Fix typo (#2795) [ci skip]
Fixed typo on line 6 "regcognizer --> recognizer"
* Adding basic support for Sinhala language. (#2788)
* adding Sinhala language package, stop words, examples and lex_attrs.
* Adding contributor agreement
* Updating contributor agreement
* Also include lowercase norm exceptions
* Fix error (#2802)
* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way. Use the resize function
* added spaCy Contributor Agreement
* Add charlax's contributor agreement (#2805)
* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)
* Contributors agreement
* Contributors agreement
* Contributors agreement
* Add jupyter=True to displacy.render in documentation (#2806)
* Revert "Also include lowercase norm exceptions"
This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.
* Remove deprecated encoding argument to msgpack
* Set up dependency tree pattern matching skeleton (#2732)
* Fix bug when too many entity types. Fixes #2800
* Fix Python 2 test failure
* Require older msgpack-numpy
* Restore encoding arg on msgpack-numpy
* Try to fix version pin for msgpack-numpy
* Update Portuguese Language (#2790)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols
* Extended punctuation and norm_exceptions in the Portuguese language
* Correct error in spacy universe docs concerning spacy-lookup (#2814)
* Update Keras Example for (Parikh et al, 2016) implementation (#2803)
* bug fixes in keras example
* created contributor agreement
* baseline for Parikh model
* initial version of parikh 2016 implemented
* tested asymmetric models
* fixed grevious error in normalization
* use standard SNLI test file
* begin to rework parikh example
* initial version of running example
* start to document the new version
* start to document the new version
* Update Decompositional Attention.ipynb
* fixed calls to similarity
* updated the README
* import sys package duh
* simplified indexing on mapping word to IDs
* stupid python indent error
* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround
* Fix typo (closes #2815) [ci skip]
* Update regex version dependency
* Set version to 2.0.13.dev3
* Skip seemingly problematic test
* Remove problematic test
* Try previous version of regex
* Revert "Remove problematic test"
This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.
* Unskip test
* Try older version of regex
* 💫 Update training examples and use minibatching (#2830)
<!--- Provide a general summary of your changes in the title. -->
## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.
### Types of change
enhancements
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Visual C++ link updated (#2842) (closes #2841) [ci skip]
* New landing page
* Add contribution agreement
* Correcting lang/ru/examples.py (#2845)
* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement
* Correct some grammatical inaccuracies in lang\ru\examples.py
* Move contributor agreement to separate file
* Set version to 2.0.13.dev4
* Add Persian(Farsi) language support (#2797)
* Also include lowercase norm exceptions
* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors
* Rule-based French Lemmatizer (#2818)
<!--- Provide a general summary of your changes in the title. -->
## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->
Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Set version to 2.0.13
* Fix formatting and consistency
* Update docs for new version [ci skip]
* Increment version [ci skip]
* Add info on wheels [ci skip]
* Adding "This is a sentence" example to Sinhala (#2846)
* Add wheels badge
* Update badge [ci skip]
* Update README.rst [ci skip]
* Update murmurhash pin
* Increment version to 2.0.14.dev0
* Update GPU docs for v2.0.14
* Add wheel to setup_requires
* Import prefer_gpu and require_gpu functions from Thinc
* Add tests for prefer_gpu() and require_gpu()
def remove_label_if_necessary(attributes):
    # More deprecated attribute handling =/
    if "label" in attributes:
        attributes["ent_type"] = attributes.pop("label")
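
# Illustrative sketch (not part of the original source): the helper above folds
# the deprecated "label" key of a retokenizer attrs dict into "ent_type", e.g.
#
#     attrs = {"label": "ORG"}
#     remove_label_if_necessary(attrs)
#     # attrs == {"ent_type": "ORG"}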

def fix_attributes(doc, attributes):
    # Normalize the deprecated "label"/"ent_type" keys onto the ENT_TYPE attribute ID.
    if "label" in attributes and "ent_type" not in attributes:
        if isinstance(attributes["label"], int):
            attributes[ENT_TYPE] = attributes["label"]
        else:
            attributes[ENT_TYPE] = doc.vocab.strings[attributes["label"]]
    if "ent_type" in attributes:
        attributes[ENT_TYPE] = attributes["ent_type"]
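
# Illustrative sketch (not part of the original source; assumes `doc` is any Doc):
# fix_attributes() maps the string-keyed entry onto the integer ENT_TYPE
# attribute ID, leaving the original keys in place, e.g.
#
#     attrs = {"label": "PERSON"}
#     fix_attributes(doc, attrs)
#     # attrs[ENT_TYPE] == doc.vocab.strings["PERSON"]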

def get_entity_info(ent_info):
    # If the entity is given as a Span, read its attributes from the span directly.
    if isinstance(ent_info, Span):
        ent_type = ent_info.label
        ent_kb_id = ent_info.kb_id
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!
This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.
* Change PyThaiNLP Url (#2876)
* Fix missing comma
* Add example showing a fix-up rule for space entities
* Set version to 2.0.17.dev0
* Update regex version
* Revert "Update regex version"
This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.
* Try setting older regex version, to align with conda
* Set version to 2.0.17
* Add spacy-js to universe [ci-skip]
* Add spacy-raspberry to universe (closes #2889)
* Add script to validate universe json [ci skip]
* Removed space in docs + added contributor indo (#2909)
* - removed unneeded space in documentation
* - added contributor info
* Allow input text of length up to max_length, inclusive (#2922)
* Include universe spec for spacy-wordnet component (#2919)
* feat: include universe spec for spacy-wordnet component
* chore: include spaCy contributor agreement
* Minor formatting changes [ci skip]
* Fix image [ci skip]
Twitter URL doesn't work on live site
* Check if the word is in one of the regular lists specific to each POS (#2886)
* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)
Resolves #2924.
## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)
### Types of change
bug fix
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix typo [ci skip]
* fixes symbolic link on py3 and windows (#2949)
* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948
* Update spacy/compat.py
Co-Authored-By: cicorias <cicorias@users.noreply.github.com>
* Fix formatting
* Update universe [ci skip]
* Catalan Language Support (#2940)
* Catalan language Support
* Ddding Catalan to documentation
* Sort languages alphabetically [ci skip]
* Update tests for pytest 4.x (#2965)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix regex pin to harmonize with conda (#2964)
* Update README.rst
* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)
Fixes #2976
* Fix typo
* Fix typo
* Remove duplicate file
* Require thinc 7.0.0.dev2
Fixes bug in gpu_ops that would use cupy instead of numpy on CPU
* Add missing import
* Fix error IDs
* Fix tests
2018-11-29 18:30:29 +03:00
|
|
|
|
        start = ent_info.start
        end = ent_info.end
    elif len(ent_info) == 3:
        ent_type, start, end = ent_info
        ent_kb_id = 0
    elif len(ent_info) == 4:
        ent_type, ent_kb_id, start, end = ent_info
    else:
        ent_id, ent_kb_id, ent_type, start, end = ent_info
    return ent_type, ent_kb_id, start, end
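
# Illustrative note (not part of the original source): the branches above
# normalize the accepted ``ent_info`` shapes into a single 4-tuple. Under that
# assumption, the inputs look roughly like this (the label "ORG" and the
# kb_id / ent_id placeholders are hypothetical example values):
#     a Span-like object            -> uses its .start / .end attributes
#     ("ORG", 0, 2)                 -> (ent_type, start, end), kb_id defaults to 0
#     ("ORG", kb_id, 0, 2)          -> (ent_type, ent_kb_id, start, end)
#     (ent_id, kb_id, "ORG", 0, 2)  -> (ent_id, ent_kb_id, ent_type, start, end)
# All shapes reduce to the (ent_type, ent_kb_id, start, end) tuple returned above.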