diff --git a/azure-pipelines.yml b/azure-pipelines.yml index c23995de6..512c6414c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -9,6 +9,11 @@ trigger: exclude: - 'website/*' - '*.md' +pr: + paths: + exclude: + - 'website/*' + - '*.md' jobs: diff --git a/spacy/about.py b/spacy/about.py index 7834bfd12..086e53242 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.1" +__version__ = "2.2.2.dev1" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 990440f59..7bf89c84a 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -546,7 +546,7 @@ cdef class GoldParse: def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None, heads=None, deps=None, entities=None, make_projective=False, cats=None, links=None, **_): - """Create a GoldParse. + """Create a GoldParse. The fields will not be initialized if len(doc) is zero. doc (Doc): The document the annotations refer to. words (iterable): A sequence of unicode word strings. @@ -575,138 +575,142 @@ cdef class GoldParse: negative examples respectively. RETURNS (GoldParse): The newly constructed object. """ - if words is None: - words = [token.text for token in doc] - if tags is None: - tags = [None for _ in words] - if heads is None: - heads = [None for _ in words] - if deps is None: - deps = [None for _ in words] - if morphology is None: - morphology = [None for _ in words] - if entities is None: - entities = ["-" for _ in doc] - elif len(entities) == 0: - entities = ["O" for _ in doc] - else: - # Translate the None values to '-', to make processing easier. 
- # See Issue #2603 - entities = [(ent if ent is not None else "-") for ent in entities] - if not isinstance(entities[0], basestring): - # Assume we have entities specified by character offset. - entities = biluo_tags_from_offsets(doc, entities) self.mem = Pool() self.loss = 0 self.length = len(doc) - # These are filled by the tagger/parser/entity recogniser - self.c.tags = self.mem.alloc(len(doc), sizeof(int)) - self.c.heads = self.mem.alloc(len(doc), sizeof(int)) - self.c.labels = self.mem.alloc(len(doc), sizeof(attr_t)) - self.c.has_dep = self.mem.alloc(len(doc), sizeof(int)) - self.c.sent_start = self.mem.alloc(len(doc), sizeof(int)) - self.c.ner = self.mem.alloc(len(doc), sizeof(Transition)) - self.cats = {} if cats is None else dict(cats) self.links = links - self.words = [None] * len(doc) - self.tags = [None] * len(doc) - self.heads = [None] * len(doc) - self.labels = [None] * len(doc) - self.ner = [None] * len(doc) - self.morphology = [None] * len(doc) - # This needs to be done before we align the words - if make_projective and heads is not None and deps is not None: - heads, deps = nonproj.projectivize(heads, deps) - - # Do many-to-one alignment for misaligned tokens. - # If we over-segment, we'll have one gold word that covers a sequence - # of predicted words - # If we under-segment, we'll have one predicted word that covers a - # sequence of gold words. - # If we "mis-segment", we'll have a sequence of predicted words covering - # a sequence of gold words. That's many-to-many -- we don't do that. 
- cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words) - - self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] - self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] - - annot_tuples = (range(len(words)), words, tags, heads, deps, entities) - self.orig_annot = list(zip(*annot_tuples)) - - for i, gold_i in enumerate(self.cand_to_gold): - if doc[i].text.isspace(): - self.words[i] = doc[i].text - self.tags[i] = "_SP" - self.heads[i] = None - self.labels[i] = None - self.ner[i] = None - self.morphology[i] = set() - if gold_i is None: - if i in i2j_multi: - self.words[i] = words[i2j_multi[i]] - self.tags[i] = tags[i2j_multi[i]] - self.morphology[i] = morphology[i2j_multi[i]] - is_last = i2j_multi[i] != i2j_multi.get(i+1) - is_first = i2j_multi[i] != i2j_multi.get(i-1) - # Set next word in multi-token span as head, until last - if not is_last: - self.heads[i] = i+1 - self.labels[i] = "subtok" - else: - self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]] - self.labels[i] = deps[i2j_multi[i]] - # Now set NER...This is annoying because if we've split - # got an entity word split into two, we need to adjust the - # BILUO tags. We can't have BB or LL etc. - # Case 1: O -- easy. - ner_tag = entities[i2j_multi[i]] - if ner_tag == "O": - self.ner[i] = "O" - # Case 2: U. This has to become a B I* L sequence. - elif ner_tag.startswith("U-"): - if is_first: - self.ner[i] = ner_tag.replace("U-", "B-", 1) - elif is_last: - self.ner[i] = ner_tag.replace("U-", "L-", 1) - else: - self.ner[i] = ner_tag.replace("U-", "I-", 1) - # Case 3: L. If not last, change to I. - elif ner_tag.startswith("L-"): - if is_last: - self.ner[i] = ner_tag - else: - self.ner[i] = ner_tag.replace("L-", "I-", 1) - # Case 4: I. 
Stays correct - elif ner_tag.startswith("I-"): - self.ner[i] = ner_tag + # avoid allocating memory if the doc does not contain any tokens + if self.length > 0: + if words is None: + words = [token.text for token in doc] + if tags is None: + tags = [None for _ in words] + if heads is None: + heads = [None for _ in words] + if deps is None: + deps = [None for _ in words] + if morphology is None: + morphology = [None for _ in words] + if entities is None: + entities = ["-" for _ in doc] + elif len(entities) == 0: + entities = ["O" for _ in doc] else: - self.words[i] = words[gold_i] - self.tags[i] = tags[gold_i] - self.morphology[i] = morphology[gold_i] - if heads[gold_i] is None: + # Translate the None values to '-', to make processing easier. + # See Issue #2603 + entities = [(ent if ent is not None else "-") for ent in entities] + if not isinstance(entities[0], basestring): + # Assume we have entities specified by character offset. + entities = biluo_tags_from_offsets(doc, entities) + + # These are filled by the tagger/parser/entity recogniser + self.c.tags = self.mem.alloc(len(doc), sizeof(int)) + self.c.heads = self.mem.alloc(len(doc), sizeof(int)) + self.c.labels = self.mem.alloc(len(doc), sizeof(attr_t)) + self.c.has_dep = self.mem.alloc(len(doc), sizeof(int)) + self.c.sent_start = self.mem.alloc(len(doc), sizeof(int)) + self.c.ner = self.mem.alloc(len(doc), sizeof(Transition)) + + self.words = [None] * len(doc) + self.tags = [None] * len(doc) + self.heads = [None] * len(doc) + self.labels = [None] * len(doc) + self.ner = [None] * len(doc) + self.morphology = [None] * len(doc) + + # This needs to be done before we align the words + if make_projective and heads is not None and deps is not None: + heads, deps = nonproj.projectivize(heads, deps) + + # Do many-to-one alignment for misaligned tokens. 
+ # If we over-segment, we'll have one gold word that covers a sequence + # of predicted words + # If we under-segment, we'll have one predicted word that covers a + # sequence of gold words. + # If we "mis-segment", we'll have a sequence of predicted words covering + # a sequence of gold words. That's many-to-many -- we don't do that. + cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words) + + self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] + self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] + + annot_tuples = (range(len(words)), words, tags, heads, deps, entities) + self.orig_annot = list(zip(*annot_tuples)) + + for i, gold_i in enumerate(self.cand_to_gold): + if doc[i].text.isspace(): + self.words[i] = doc[i].text + self.tags[i] = "_SP" self.heads[i] = None + self.labels[i] = None + self.ner[i] = None + self.morphology[i] = set() + if gold_i is None: + if i in i2j_multi: + self.words[i] = words[i2j_multi[i]] + self.tags[i] = tags[i2j_multi[i]] + self.morphology[i] = morphology[i2j_multi[i]] + is_last = i2j_multi[i] != i2j_multi.get(i+1) + is_first = i2j_multi[i] != i2j_multi.get(i-1) + # Set next word in multi-token span as head, until last + if not is_last: + self.heads[i] = i+1 + self.labels[i] = "subtok" + else: + self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]] + self.labels[i] = deps[i2j_multi[i]] + # Now set NER...This is annoying because if we've split + # got an entity word split into two, we need to adjust the + # BILUO tags. We can't have BB or LL etc. + # Case 1: O -- easy. + ner_tag = entities[i2j_multi[i]] + if ner_tag == "O": + self.ner[i] = "O" + # Case 2: U. This has to become a B I* L sequence. + elif ner_tag.startswith("U-"): + if is_first: + self.ner[i] = ner_tag.replace("U-", "B-", 1) + elif is_last: + self.ner[i] = ner_tag.replace("U-", "L-", 1) + else: + self.ner[i] = ner_tag.replace("U-", "I-", 1) + # Case 3: L. If not last, change to I. 
+ elif ner_tag.startswith("L-"): + if is_last: + self.ner[i] = ner_tag + else: + self.ner[i] = ner_tag.replace("L-", "I-", 1) + # Case 4: I. Stays correct + elif ner_tag.startswith("I-"): + self.ner[i] = ner_tag else: - self.heads[i] = self.gold_to_cand[heads[gold_i]] - self.labels[i] = deps[gold_i] - self.ner[i] = entities[gold_i] + self.words[i] = words[gold_i] + self.tags[i] = tags[gold_i] + self.morphology[i] = morphology[gold_i] + if heads[gold_i] is None: + self.heads[i] = None + else: + self.heads[i] = self.gold_to_cand[heads[gold_i]] + self.labels[i] = deps[gold_i] + self.ner[i] = entities[gold_i] - # Prevent whitespace that isn't within entities from being tagged as - # an entity. - for i in range(len(self.ner)): - if self.tags[i] == "_SP": - prev_ner = self.ner[i-1] if i >= 1 else None - next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None - if prev_ner == "O" or next_ner == "O": - self.ner[i] = "O" + # Prevent whitespace that isn't within entities from being tagged as + # an entity. + for i in range(len(self.ner)): + if self.tags[i] == "_SP": + prev_ner = self.ner[i-1] if i >= 1 else None + next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None + if prev_ner == "O" or next_ner == "O": + self.ner[i] = "O" - cycle = nonproj.contains_cycle(self.heads) - if cycle is not None: - raise ValueError(Errors.E069.format(cycle=cycle, - cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]), - doc_tokens=" ".join(words[:50]))) + cycle = nonproj.contains_cycle(self.heads) + if cycle is not None: + raise ValueError(Errors.E069.format(cycle=cycle, + cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]), + doc_tokens=" ".join(words[:50]))) def __len__(self): """Get the number of gold-standard tokens. 
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 5dd6eab77..af0450592 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -254,7 +254,12 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None, cdef PatternStateC state cdef int i, j, nr_extra_attr cdef Pool mem = Pool() - predicate_cache = mem.alloc(doc.length * len(predicates), sizeof(char)) + output = [] + if doc.length == 0: + # avoid any processing or mem alloc if the document is empty + return output + if len(predicates) > 0: + predicate_cache = mem.alloc(doc.length * len(predicates), sizeof(char)) if extensions is not None and len(extensions) >= 1: nr_extra_attr = max(extensions.values()) + 1 extra_attr_values = mem.alloc(doc.length * nr_extra_attr, sizeof(attr_t)) @@ -278,7 +283,6 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None, predicate_cache += len(predicates) # Handle matches that end in 0-width patterns finish_states(matches, states) - output = [] seen = set() for i in range(matches.size()): match = ( @@ -560,12 +564,14 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) for j, (attr, value) in enumerate(spec): pattern[i].attrs[j].attr = attr pattern[i].attrs[j].value = value - pattern[i].extra_attrs = mem.alloc(len(extensions), sizeof(IndexValueC)) + if len(extensions) > 0: + pattern[i].extra_attrs = mem.alloc(len(extensions), sizeof(IndexValueC)) for j, (index, value) in enumerate(extensions): pattern[i].extra_attrs[j].index = index pattern[i].extra_attrs[j].value = value pattern[i].nr_extra_attr = len(extensions) - pattern[i].py_predicates = mem.alloc(len(predicates), sizeof(int32_t)) + if len(predicates) > 0: + pattern[i].py_predicates = mem.alloc(len(predicates), sizeof(int32_t)) for j, index in enumerate(predicates): pattern[i].py_predicates[j] = index pattern[i].nr_py = len(predicates) diff --git a/spacy/syntax/_parser_model.pxd b/spacy/syntax/_parser_model.pxd index 
5aec986d2..9c72f3415 100644 --- a/spacy/syntax/_parser_model.pxd +++ b/spacy/syntax/_parser_model.pxd @@ -36,7 +36,9 @@ cdef WeightsC get_c_weights(model) except * cdef SizesC get_c_sizes(model, int batch_size) except * -cdef void resize_activations(ActivationsC* A, SizesC n) nogil +cdef ActivationsC alloc_activations(SizesC n) nogil + +cdef void free_activations(const ActivationsC* A) nogil cdef void predict_states(ActivationsC* A, StateC** states, const WeightsC* W, SizesC n) nogil diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 841e33432..ce3dcbfa5 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -62,6 +62,21 @@ cdef SizesC get_c_sizes(model, int batch_size) except *: return output +cdef ActivationsC alloc_activations(SizesC n) nogil: + cdef ActivationsC A + memset(&A, 0, sizeof(A)) + resize_activations(&A, n) + return A + + +cdef void free_activations(const ActivationsC* A) nogil: + free(A.token_ids) + free(A.scores) + free(A.unmaxed) + free(A.hiddens) + free(A.is_valid) + + cdef void resize_activations(ActivationsC* A, SizesC n) nogil: if n.states <= A._max_size: A._curr_size = n.states diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index aeb4a5306..dd19b0e43 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -27,7 +27,8 @@ from thinc.neural.util import get_array_module from thinc.linalg cimport Vec, VecVec import srsly -from ._parser_model cimport resize_activations, predict_states, arg_max_if_valid +from ._parser_model cimport alloc_activations, free_activations +from ._parser_model cimport predict_states, arg_max_if_valid from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss from ._parser_model cimport get_c_weights, get_c_sizes from ._parser_model import ParserModel @@ -312,8 +313,7 @@ cdef class Parser: WeightsC weights, SizesC sizes) nogil: cdef int i, j cdef vector[StateC*] unfinished - cdef ActivationsC activations - 
memset(&activations, 0, sizeof(activations)) + cdef ActivationsC activations = alloc_activations(sizes) while sizes.states >= 1: predict_states(&activations, states, &weights, sizes) @@ -327,6 +327,7 @@ cdef class Parser: states[i] = unfinished[i] sizes.states = unfinished.size() unfinished.clear() + free_activations(&activations) def set_annotations(self, docs, states_or_beams, tensors=None): cdef StateClass state @@ -363,6 +364,9 @@ cdef class Parser: cdef void c_transition_batch(self, StateC** states, const float* scores, int nr_class, int batch_size) nogil: + # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc + with gil: + assert self.moves.n_moves > 0 is_valid = calloc(self.moves.n_moves, sizeof(int)) cdef int i, guess cdef Transition action @@ -546,6 +550,10 @@ cdef class Parser: cdef GoldParse gold cdef Pool mem = Pool() cdef int i + + # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc + assert self.moves.n_moves > 0 + is_valid = mem.alloc(self.moves.n_moves, sizeof(int)) costs = mem.alloc(self.moves.n_moves, sizeof(float)) cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves), diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 58b3a6993..7876813e0 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -83,6 +83,8 @@ cdef class TransitionSystem: def get_oracle_sequence(self, doc, GoldParse gold): cdef Pool mem = Pool() + # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc + assert self.n_moves > 0 costs = mem.alloc(self.n_moves, sizeof(float)) is_valid = mem.alloc(self.n_moves, sizeof(int)) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 4705f8e77..b688ab9dd 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -141,6 +141,18 @@ def 
test_vectors_most_similar(most_similar_vectors_data): assert all(row[0] == i for i, row in enumerate(best_rows)) +def test_vectors_most_similar_identical(): + """Test that most similar identical vectors are assigned a score of 1.0.""" + data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f") + v = Vectors(data=data, keys=["A", "B", "C"]) + keys, _, scores = v.most_similar(numpy.asarray([[4, 2, 2, 2]], dtype="f")) + assert scores[0][0] == 1.0 # not 1.0000002 + data = numpy.asarray([[1, 2, 3], [1, 2, 3], [1, 1, 1]], dtype="f") + v = Vectors(data=data, keys=["A", "B", "C"]) + keys, _, scores = v.most_similar(numpy.asarray([[1, 2, 3]], dtype="f")) + assert scores[0][0] == 1.0 # not 0.9999999 + + @pytest.mark.parametrize("text", ["apple and orange"]) def test_vectors_token_vector(tokenizer_v, vectors, text): doc = tokenizer_v(text) @@ -302,4 +314,4 @@ def test_vocab_prune_vectors(): assert list(remap.keys()) == ["kitten"] neighbour, similarity = list(remap.values())[0] assert neighbour == "cat", remap - assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6) + assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index cdfa55dcb..b39bb1ecb 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -331,6 +331,9 @@ cdef class Tokenizer: cdef int _save_cached(self, const TokenC* tokens, hash_t key, int has_special, int n) except -1: cdef int i + if n <= 0: + # avoid mem alloc of zero length + return 0 for i in range(n): if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL: return 0 diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index f8b13dd78..5f890de45 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -157,6 +157,9 @@ def _merge(Doc doc, merges): cdef TokenC* token cdef Pool mem = Pool() cdef int merged_iob = 0 + + # merges should not be empty, but make sure to avoid zero-length mem alloc + assert 
len(merges) > 0 tokens = mem.alloc(len(merges), sizeof(TokenC)) spans = [] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 80a808bae..6afe89e05 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -791,6 +791,8 @@ cdef class Doc: # Get set up for fast loading cdef Pool mem = Pool() cdef int n_attrs = len(attrs) + # attrs should not be empty, but make sure to avoid zero-length mem alloc + assert n_attrs > 0 attr_ids = mem.alloc(n_attrs, sizeof(attr_id_t)) for i, attr_id in enumerate(attrs): attr_ids[i] = attr_id diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 0f015521a..44dddb30c 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -344,8 +344,12 @@ cdef class Vectors: sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1] scores[i:i+batch_size] = scores[sorted_index] best_rows[i:i+batch_size] = best_rows[sorted_index] - + xp = get_array_module(self.data) + # Round values really close to 1 or -1 + scores = xp.around(scores, decimals=4, out=scores) + # Account for numerical error: we want to return scores in the range [-1, 1] + scores = xp.clip(scores, a_min=-1, a_max=1, out=scores) row2key = {row: key for key, row in self.key2row.items()} keys = xp.asarray( [[row2key[row] for row in best_rows[i] if row in row2key] diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index fe8e4e2d2..a375f416c 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -1135,6 +1135,8 @@ def expand_person_entities(doc): if prev_token.text in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."): new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label) new_ents.append(new_ent) + else: + new_ents.append(ent) else: new_ents.append(ent) doc.ents = new_ents