Merge branch 'master' into spacy.io

2025-12-06 17:54:21 +03:00 · 2019-10-23 14:41:38 +02:00 · 2019-10-23 14:41:38 +02:00 · e4820fa667
commit e4820fa667
parent 4909f478ce 7fc39f124c
14 changed files with 200 additions and 132 deletions
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -9,6 +9,11 @@ trigger:
    exclude:
    - 'website/*'
    - '*.md'
+pr:
+  paths:
+    exclude:
+    - 'website/*'
+    - '*.md'

 jobs:

--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.2.1"
+__version__ = "2.2.2.dev1"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -546,7 +546,7 @@ cdef class GoldParse:
    def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None,
                 heads=None, deps=None, entities=None, make_projective=False,
                 cats=None, links=None, **_):
-        """Create a GoldParse.
+        """Create a GoldParse. The fields will not be initialized if len(doc) is zero.

        doc (Doc): The document the annotations refer to.
        words (iterable): A sequence of unicode word strings.
@ -575,6 +575,15 @@ cdef class GoldParse:
            negative examples respectively.
        RETURNS (GoldParse): The newly constructed object.
        """
+        self.mem = Pool()
+        self.loss = 0
+        self.length = len(doc)
+
+        self.cats = {} if cats is None else dict(cats)
+        self.links = links
+
+        # avoid allocating memory if the doc does not contain any tokens
+        if self.length > 0:
            if words is None:
                words = [token.text for token in doc]
            if tags is None:
@ -596,9 +605,6 @@ cdef class GoldParse:
                if not isinstance(entities[0], basestring):
                    # Assume we have entities specified by character offset.
                    entities = biluo_tags_from_offsets(doc, entities)
-        self.mem = Pool()
-        self.loss = 0
-        self.length = len(doc)

            # These are filled by the tagger/parser/entity recogniser
            self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
@ -608,8 +614,6 @@ cdef class GoldParse:
            self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))

-        self.cats = {} if cats is None else dict(cats)
-        self.links = links
            self.words = [None] * len(doc)
            self.tags = [None] * len(doc)
            self.heads = [None] * len(doc)
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@ -254,6 +254,11 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
    cdef PatternStateC state
    cdef int i, j, nr_extra_attr
    cdef Pool mem = Pool()
+    output = []
+    if doc.length == 0:
+        # avoid any processing or mem alloc if the document is empty
+        return output
+    if len(predicates) > 0:
        predicate_cache = <char*>mem.alloc(doc.length * len(predicates), sizeof(char))
    if extensions is not None and len(extensions) >= 1:
        nr_extra_attr = max(extensions.values()) + 1
@ -278,7 +283,6 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
        predicate_cache += len(predicates)
    # Handle matches that end in 0-width patterns
    finish_states(matches, states)
-    output = []
    seen = set()
    for i in range(matches.size()):
        match = (
@ -560,11 +564,13 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
        for j, (attr, value) in enumerate(spec):
            pattern[i].attrs[j].attr = attr
            pattern[i].attrs[j].value = value
+        if len(extensions) > 0:
            pattern[i].extra_attrs = <IndexValueC*>mem.alloc(len(extensions), sizeof(IndexValueC))
        for j, (index, value) in enumerate(extensions):
            pattern[i].extra_attrs[j].index = index
            pattern[i].extra_attrs[j].value = value
        pattern[i].nr_extra_attr = len(extensions)
+        if len(predicates) > 0:
            pattern[i].py_predicates = <int32_t*>mem.alloc(len(predicates), sizeof(int32_t))
        for j, index in enumerate(predicates):
            pattern[i].py_predicates[j] = index
--- a/spacy/syntax/_parser_model.pxd
+++ b/spacy/syntax/_parser_model.pxd
@ -36,7 +36,9 @@ cdef WeightsC get_c_weights(model) except *

 cdef SizesC get_c_sizes(model, int batch_size) except *

-cdef void resize_activations(ActivationsC* A, SizesC n) nogil
+cdef ActivationsC alloc_activations(SizesC n) nogil
+
+cdef void free_activations(const ActivationsC* A) nogil

 cdef void predict_states(ActivationsC* A, StateC** states,
        const WeightsC* W, SizesC n) nogil
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@ -62,6 +62,21 @@ cdef SizesC get_c_sizes(model, int batch_size) except *:
    return output


+cdef ActivationsC alloc_activations(SizesC n) nogil:
+    cdef ActivationsC A
+    memset(&A, 0, sizeof(A))
+    resize_activations(&A, n)
+    return A
+
+
+cdef void free_activations(const ActivationsC* A) nogil:
+    free(A.token_ids)
+    free(A.scores)
+    free(A.unmaxed)
+    free(A.hiddens)
+    free(A.is_valid)
+
+
 cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
    if n.states <= A._max_size:
        A._curr_size = n.states
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -27,7 +27,8 @@ from thinc.neural.util import get_array_module
 from thinc.linalg cimport Vec, VecVec
 import srsly

-from ._parser_model cimport resize_activations, predict_states, arg_max_if_valid
+from ._parser_model cimport alloc_activations, free_activations
+from ._parser_model cimport predict_states, arg_max_if_valid
 from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
 from ._parser_model cimport get_c_weights, get_c_sizes
 from ._parser_model import ParserModel
@ -312,8 +313,7 @@ cdef class Parser:
            WeightsC weights, SizesC sizes) nogil:
        cdef int i, j
        cdef vector[StateC*] unfinished
-        cdef ActivationsC activations
-        memset(&activations, 0, sizeof(activations))
+        cdef ActivationsC activations = alloc_activations(sizes)
        while sizes.states >= 1:
            predict_states(&activations,
                states, &weights, sizes)
@ -327,6 +327,7 @@ cdef class Parser:
                states[i] = unfinished[i]
            sizes.states = unfinished.size()
            unfinished.clear()
+        free_activations(&activations)

    def set_annotations(self, docs, states_or_beams, tensors=None):
        cdef StateClass state
@ -363,6 +364,9 @@ cdef class Parser:

    cdef void c_transition_batch(self, StateC** states, const float* scores,
            int nr_class, int batch_size) nogil:
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        with gil:
+            assert self.moves.n_moves > 0
        is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
        cdef int i, guess
        cdef Transition action
@ -546,6 +550,10 @@ cdef class Parser:
        cdef GoldParse gold
        cdef Pool mem = Pool()
        cdef int i
+
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        assert self.moves.n_moves > 0
+
        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
        cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@ -83,6 +83,8 @@ cdef class TransitionSystem:

    def get_oracle_sequence(self, doc, GoldParse gold):
        cdef Pool mem = Pool()
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        assert self.n_moves > 0
        costs = <float*>mem.alloc(self.n_moves, sizeof(float))
        is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))

--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@ -141,6 +141,18 @@ def test_vectors_most_similar(most_similar_vectors_data):
    assert all(row[0] == i for i, row in enumerate(best_rows))


+def test_vectors_most_similar_identical():
+    """Test that most similar identical vectors are assigned a score of 1.0."""
+    data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
+    v = Vectors(data=data, keys=["A", "B", "C"])
+    keys, _, scores = v.most_similar(numpy.asarray([[4, 2, 2, 2]], dtype="f"))
+    assert scores[0][0] == 1.0  # not 1.0000002
+    data = numpy.asarray([[1, 2, 3], [1, 2, 3], [1, 1, 1]], dtype="f")
+    v = Vectors(data=data, keys=["A", "B", "C"])
+    keys, _, scores = v.most_similar(numpy.asarray([[1, 2, 3]], dtype="f"))
+    assert scores[0][0] == 1.0  # not 0.9999999
+
+
@pytest.mark.parametrize("text", ["apple and orange"])
 def test_vectors_token_vector(tokenizer_v, vectors, text):
    doc = tokenizer_v(text)
@ -302,4 +314,4 @@ def test_vocab_prune_vectors():
    assert list(remap.keys()) == ["kitten"]
    neighbour, similarity = list(remap.values())[0]
    assert neighbour == "cat", remap
-    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)
+    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -331,6 +331,9 @@ cdef class Tokenizer:
    cdef int _save_cached(self, const TokenC* tokens, hash_t key,
                          int has_special, int n) except -1:
        cdef int i
+        if n <= 0:
+            # avoid mem alloc of zero length
+            return 0
        for i in range(n):
            if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
                return 0
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@ -157,6 +157,9 @@ def _merge(Doc doc, merges):
    cdef TokenC* token
    cdef Pool mem = Pool()
    cdef int merged_iob = 0
+
+    # merges should not be empty, but make sure to avoid zero-length mem alloc
+    assert len(merges) > 0
    tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC))
    spans = []

--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -791,6 +791,8 @@ cdef class Doc:
        # Get set up for fast loading
        cdef Pool mem = Pool()
        cdef int n_attrs = len(attrs)
+        # attrs should not be empty, but make sure to avoid zero-length mem alloc
+        assert n_attrs > 0
        attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t))
        for i, attr_id in enumerate(attrs):
            attr_ids[i] = attr_id
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@ -346,6 +346,10 @@ cdef class Vectors:
                best_rows[i:i+batch_size] = best_rows[sorted_index]
        
        xp = get_array_module(self.data)
+        # Round scores to 4 decimals so values really close to 1 or -1 become exactly 1 or -1
+        scores = xp.around(scores, decimals=4, out=scores)
+        # Account for numerical error: clip so the returned scores stay within the range [-1, 1]
+        scores = xp.clip(scores, a_min=-1, a_max=1, out=scores)
        row2key = {row: key for key, row in self.key2row.items()}
        keys = xp.asarray(
            [[row2key[row] for row in best_rows[i] if row in row2key] 
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@ -1137,6 +1137,8 @@ def expand_person_entities(doc):
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
+        else:
+            new_ents.append(ent)
    doc.ents = new_ents
    return doc
 ```