Merge branch 'master' into spacy.io

This commit is contained in:
Ines Montani 2019-10-23 14:41:38 +02:00
commit e4820fa667
14 changed files with 200 additions and 132 deletions

View File

@ -9,6 +9,11 @@ trigger:
exclude: exclude:
- 'website/*' - 'website/*'
- '*.md' - '*.md'
pr:
paths:
exclude:
- 'website/*'
- '*.md'
jobs: jobs:

View File

@ -1,6 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy" __title__ = "spacy"
__version__ = "2.2.1" __version__ = "2.2.2.dev1"
__release__ = True __release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -546,7 +546,7 @@ cdef class GoldParse:
def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None, def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None,
heads=None, deps=None, entities=None, make_projective=False, heads=None, deps=None, entities=None, make_projective=False,
cats=None, links=None, **_): cats=None, links=None, **_):
"""Create a GoldParse. """Create a GoldParse. The fields will not be initialized if len(doc) is zero.
doc (Doc): The document the annotations refer to. doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings. words (iterable): A sequence of unicode word strings.
@ -575,6 +575,15 @@ cdef class GoldParse:
negative examples respectively. negative examples respectively.
RETURNS (GoldParse): The newly constructed object. RETURNS (GoldParse): The newly constructed object.
""" """
self.mem = Pool()
self.loss = 0
self.length = len(doc)
self.cats = {} if cats is None else dict(cats)
self.links = links
# avoid allocating memory if the doc does not contain any tokens
if self.length > 0:
if words is None: if words is None:
words = [token.text for token in doc] words = [token.text for token in doc]
if tags is None: if tags is None:
@ -596,9 +605,6 @@ cdef class GoldParse:
if not isinstance(entities[0], basestring): if not isinstance(entities[0], basestring):
# Assume we have entities specified by character offset. # Assume we have entities specified by character offset.
entities = biluo_tags_from_offsets(doc, entities) entities = biluo_tags_from_offsets(doc, entities)
self.mem = Pool()
self.loss = 0
self.length = len(doc)
# These are filled by the tagger/parser/entity recogniser # These are filled by the tagger/parser/entity recogniser
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int)) self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
@ -608,8 +614,6 @@ cdef class GoldParse:
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int)) self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition)) self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.cats = {} if cats is None else dict(cats)
self.links = links
self.words = [None] * len(doc) self.words = [None] * len(doc)
self.tags = [None] * len(doc) self.tags = [None] * len(doc)
self.heads = [None] * len(doc) self.heads = [None] * len(doc)

View File

@ -254,6 +254,11 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
cdef PatternStateC state cdef PatternStateC state
cdef int i, j, nr_extra_attr cdef int i, j, nr_extra_attr
cdef Pool mem = Pool() cdef Pool mem = Pool()
output = []
if doc.length == 0:
# avoid any processing or mem alloc if the document is empty
return output
if len(predicates) > 0:
predicate_cache = <char*>mem.alloc(doc.length * len(predicates), sizeof(char)) predicate_cache = <char*>mem.alloc(doc.length * len(predicates), sizeof(char))
if extensions is not None and len(extensions) >= 1: if extensions is not None and len(extensions) >= 1:
nr_extra_attr = max(extensions.values()) + 1 nr_extra_attr = max(extensions.values()) + 1
@ -278,7 +283,6 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
predicate_cache += len(predicates) predicate_cache += len(predicates)
# Handle matches that end in 0-width patterns # Handle matches that end in 0-width patterns
finish_states(matches, states) finish_states(matches, states)
output = []
seen = set() seen = set()
for i in range(matches.size()): for i in range(matches.size()):
match = ( match = (
@ -560,11 +564,13 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
for j, (attr, value) in enumerate(spec): for j, (attr, value) in enumerate(spec):
pattern[i].attrs[j].attr = attr pattern[i].attrs[j].attr = attr
pattern[i].attrs[j].value = value pattern[i].attrs[j].value = value
if len(extensions) > 0:
pattern[i].extra_attrs = <IndexValueC*>mem.alloc(len(extensions), sizeof(IndexValueC)) pattern[i].extra_attrs = <IndexValueC*>mem.alloc(len(extensions), sizeof(IndexValueC))
for j, (index, value) in enumerate(extensions): for j, (index, value) in enumerate(extensions):
pattern[i].extra_attrs[j].index = index pattern[i].extra_attrs[j].index = index
pattern[i].extra_attrs[j].value = value pattern[i].extra_attrs[j].value = value
pattern[i].nr_extra_attr = len(extensions) pattern[i].nr_extra_attr = len(extensions)
if len(predicates) > 0:
pattern[i].py_predicates = <int32_t*>mem.alloc(len(predicates), sizeof(int32_t)) pattern[i].py_predicates = <int32_t*>mem.alloc(len(predicates), sizeof(int32_t))
for j, index in enumerate(predicates): for j, index in enumerate(predicates):
pattern[i].py_predicates[j] = index pattern[i].py_predicates[j] = index

View File

@ -36,7 +36,9 @@ cdef WeightsC get_c_weights(model) except *
cdef SizesC get_c_sizes(model, int batch_size) except * cdef SizesC get_c_sizes(model, int batch_size) except *
cdef void resize_activations(ActivationsC* A, SizesC n) nogil cdef ActivationsC alloc_activations(SizesC n) nogil
cdef void free_activations(const ActivationsC* A) nogil
cdef void predict_states(ActivationsC* A, StateC** states, cdef void predict_states(ActivationsC* A, StateC** states,
const WeightsC* W, SizesC n) nogil const WeightsC* W, SizesC n) nogil

View File

@ -62,6 +62,21 @@ cdef SizesC get_c_sizes(model, int batch_size) except *:
return output return output
cdef ActivationsC alloc_activations(SizesC n) nogil:
cdef ActivationsC A
memset(&A, 0, sizeof(A))
resize_activations(&A, n)
return A
cdef void free_activations(const ActivationsC* A) nogil:
free(A.token_ids)
free(A.scores)
free(A.unmaxed)
free(A.hiddens)
free(A.is_valid)
cdef void resize_activations(ActivationsC* A, SizesC n) nogil: cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
if n.states <= A._max_size: if n.states <= A._max_size:
A._curr_size = n.states A._curr_size = n.states

View File

@ -27,7 +27,8 @@ from thinc.neural.util import get_array_module
from thinc.linalg cimport Vec, VecVec from thinc.linalg cimport Vec, VecVec
import srsly import srsly
from ._parser_model cimport resize_activations, predict_states, arg_max_if_valid from ._parser_model cimport alloc_activations, free_activations
from ._parser_model cimport predict_states, arg_max_if_valid
from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
from ._parser_model cimport get_c_weights, get_c_sizes from ._parser_model cimport get_c_weights, get_c_sizes
from ._parser_model import ParserModel from ._parser_model import ParserModel
@ -312,8 +313,7 @@ cdef class Parser:
WeightsC weights, SizesC sizes) nogil: WeightsC weights, SizesC sizes) nogil:
cdef int i, j cdef int i, j
cdef vector[StateC*] unfinished cdef vector[StateC*] unfinished
cdef ActivationsC activations cdef ActivationsC activations = alloc_activations(sizes)
memset(&activations, 0, sizeof(activations))
while sizes.states >= 1: while sizes.states >= 1:
predict_states(&activations, predict_states(&activations,
states, &weights, sizes) states, &weights, sizes)
@ -327,6 +327,7 @@ cdef class Parser:
states[i] = unfinished[i] states[i] = unfinished[i]
sizes.states = unfinished.size() sizes.states = unfinished.size()
unfinished.clear() unfinished.clear()
free_activations(&activations)
def set_annotations(self, docs, states_or_beams, tensors=None): def set_annotations(self, docs, states_or_beams, tensors=None):
cdef StateClass state cdef StateClass state
@ -363,6 +364,9 @@ cdef class Parser:
cdef void c_transition_batch(self, StateC** states, const float* scores, cdef void c_transition_batch(self, StateC** states, const float* scores,
int nr_class, int batch_size) nogil: int nr_class, int batch_size) nogil:
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
with gil:
assert self.moves.n_moves > 0
is_valid = <int*>calloc(self.moves.n_moves, sizeof(int)) is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
cdef int i, guess cdef int i, guess
cdef Transition action cdef Transition action
@ -546,6 +550,10 @@ cdef class Parser:
cdef GoldParse gold cdef GoldParse gold
cdef Pool mem = Pool() cdef Pool mem = Pool()
cdef int i cdef int i
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
assert self.moves.n_moves > 0
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int)) is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float)) costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves), cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),

View File

@ -83,6 +83,8 @@ cdef class TransitionSystem:
def get_oracle_sequence(self, doc, GoldParse gold): def get_oracle_sequence(self, doc, GoldParse gold):
cdef Pool mem = Pool() cdef Pool mem = Pool()
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
assert self.n_moves > 0
costs = <float*>mem.alloc(self.n_moves, sizeof(float)) costs = <float*>mem.alloc(self.n_moves, sizeof(float))
is_valid = <int*>mem.alloc(self.n_moves, sizeof(int)) is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))

View File

@ -141,6 +141,18 @@ def test_vectors_most_similar(most_similar_vectors_data):
assert all(row[0] == i for i, row in enumerate(best_rows)) assert all(row[0] == i for i, row in enumerate(best_rows))
def test_vectors_most_similar_identical():
"""Test that most similar identical vectors are assigned a score of 1.0."""
data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
v = Vectors(data=data, keys=["A", "B", "C"])
keys, _, scores = v.most_similar(numpy.asarray([[4, 2, 2, 2]], dtype="f"))
assert scores[0][0] == 1.0 # not 1.0000002
data = numpy.asarray([[1, 2, 3], [1, 2, 3], [1, 1, 1]], dtype="f")
v = Vectors(data=data, keys=["A", "B", "C"])
keys, _, scores = v.most_similar(numpy.asarray([[1, 2, 3]], dtype="f"))
assert scores[0][0] == 1.0 # not 0.9999999
@pytest.mark.parametrize("text", ["apple and orange"]) @pytest.mark.parametrize("text", ["apple and orange"])
def test_vectors_token_vector(tokenizer_v, vectors, text): def test_vectors_token_vector(tokenizer_v, vectors, text):
doc = tokenizer_v(text) doc = tokenizer_v(text)
@ -302,4 +314,4 @@ def test_vocab_prune_vectors():
assert list(remap.keys()) == ["kitten"] assert list(remap.keys()) == ["kitten"]
neighbour, similarity = list(remap.values())[0] neighbour, similarity = list(remap.values())[0]
assert neighbour == "cat", remap assert neighbour == "cat", remap
assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6) assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)

View File

@ -331,6 +331,9 @@ cdef class Tokenizer:
cdef int _save_cached(self, const TokenC* tokens, hash_t key, cdef int _save_cached(self, const TokenC* tokens, hash_t key,
int has_special, int n) except -1: int has_special, int n) except -1:
cdef int i cdef int i
if n <= 0:
# avoid mem alloc of zero length
return 0
for i in range(n): for i in range(n):
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL: if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
return 0 return 0

View File

@ -157,6 +157,9 @@ def _merge(Doc doc, merges):
cdef TokenC* token cdef TokenC* token
cdef Pool mem = Pool() cdef Pool mem = Pool()
cdef int merged_iob = 0 cdef int merged_iob = 0
# merges should not be empty, but make sure to avoid zero-length mem alloc
assert len(merges) > 0
tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC)) tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC))
spans = [] spans = []

View File

@ -791,6 +791,8 @@ cdef class Doc:
# Get set up for fast loading # Get set up for fast loading
cdef Pool mem = Pool() cdef Pool mem = Pool()
cdef int n_attrs = len(attrs) cdef int n_attrs = len(attrs)
# attrs should not be empty, but make sure to avoid zero-length mem alloc
assert n_attrs > 0
attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t)) attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t))
for i, attr_id in enumerate(attrs): for i, attr_id in enumerate(attrs):
attr_ids[i] = attr_id attr_ids[i] = attr_id

View File

@ -346,6 +346,10 @@ cdef class Vectors:
best_rows[i:i+batch_size] = best_rows[sorted_index] best_rows[i:i+batch_size] = best_rows[sorted_index]
xp = get_array_module(self.data) xp = get_array_module(self.data)
# Round values really close to 1 or -1
scores = xp.around(scores, decimals=4, out=scores)
# Account for numerical error we want to return in range -1, 1
scores = xp.clip(scores, a_min=-1, a_max=1, out=scores)
row2key = {row: key for key, row in self.key2row.items()} row2key = {row: key for key, row in self.key2row.items()}
keys = xp.asarray( keys = xp.asarray(
[[row2key[row] for row in best_rows[i] if row in row2key] [[row2key[row] for row in best_rows[i] if row in row2key]

View File

@ -1137,6 +1137,8 @@ def expand_person_entities(doc):
new_ents.append(new_ent) new_ents.append(new_ent)
else: else:
new_ents.append(ent) new_ents.append(ent)
else:
new_ents.append(ent)
doc.ents = new_ents doc.ents = new_ents
return doc return doc
``` ```