From 3dfc76457709818fd3675b727d34e056aa6d434c Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Tue, 22 Oct 2019 15:06:44 +0200
Subject: [PATCH 1/8] Free pointers in parser activations (#4486)

* Free pointers in ActivationsC
* Restructure alloc/free for parser activations
* Rewrite/restructure to have allocation and free in parallel functions in
  `_parser_model` rather than partially in `_parseC()` in `Parser`.
* Remove `resize_activations` from `_parser_model.pxd`.
---
 spacy/syntax/_parser_model.pxd |  4 +++-
 spacy/syntax/_parser_model.pyx | 15 +++++++++++++++
 spacy/syntax/nn_parser.pyx     |  7 ++++---
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/spacy/syntax/_parser_model.pxd b/spacy/syntax/_parser_model.pxd
index 5aec986d2..9c72f3415 100644
--- a/spacy/syntax/_parser_model.pxd
+++ b/spacy/syntax/_parser_model.pxd
@@ -36,7 +36,9 @@ cdef WeightsC get_c_weights(model) except *
 
 cdef SizesC get_c_sizes(model, int batch_size) except *
 
-cdef void resize_activations(ActivationsC* A, SizesC n) nogil
+cdef ActivationsC alloc_activations(SizesC n) nogil
+
+cdef void free_activations(const ActivationsC* A) nogil
 
 cdef void predict_states(ActivationsC* A, StateC** states, const WeightsC* W,
         SizesC n) nogil
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index 841e33432..ce3dcbfa5 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -62,6 +62,21 @@ cdef SizesC get_c_sizes(model, int batch_size) except *:
     return output
 
 
+cdef ActivationsC alloc_activations(SizesC n) nogil:
+    cdef ActivationsC A
+    memset(&A, 0, sizeof(A))
+    resize_activations(&A, n)
+    return A
+
+
+cdef void free_activations(const ActivationsC* A) nogil:
+    free(A.token_ids)
+    free(A.scores)
+    free(A.unmaxed)
+    free(A.hiddens)
+    free(A.is_valid)
+
+
 cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
     if n.states <= A._max_size:
         A._curr_size = n.states
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index aeb4a5306..55b9c628b 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -27,7 +27,8 @@ from thinc.neural.util import get_array_module
 from thinc.linalg cimport Vec, VecVec
 import srsly
 
-from ._parser_model cimport resize_activations, predict_states, arg_max_if_valid
+from ._parser_model cimport alloc_activations, free_activations
+from ._parser_model cimport predict_states, arg_max_if_valid
 from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
 from ._parser_model cimport get_c_weights, get_c_sizes
 from ._parser_model import ParserModel
@@ -312,8 +313,7 @@ cdef class Parser:
                 WeightsC weights, SizesC sizes) nogil:
         cdef int i, j
         cdef vector[StateC*] unfinished
-        cdef ActivationsC activations
-        memset(&activations, 0, sizeof(activations))
+        cdef ActivationsC activations = alloc_activations(sizes)
         while sizes.states >= 1:
             predict_states(&activations, states, &weights, sizes)
@@ -327,6 +327,7 @@ cdef class Parser:
                 states[i] = unfinished[i]
             sizes.states = unfinished.size()
             unfinished.clear()
+        free_activations(&activations)
 
     def set_annotations(self, docs, states_or_beams, tensors=None):
         cdef StateClass state

From 48886afc789806cf461b625f7284da06d7e33785 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Tue, 22 Oct 2019 16:54:33 +0200
Subject: [PATCH 2/8] prevent zero-length mem alloc (#4429)

* raise specific error when removing a matcher rule that doesn't exist
* rephrasing
* goldparse init: allocate fields only if doc is not empty
* avoid zero length alloc in saving tokenizer cache
* avoid allocating zero length mem in matcher
* asserts to avoid allocating zero length mem
* fix zero-length allocation in matcher
* bump cymem version
* revert cymem version bump
---
 spacy/gold.pyx                     | 246 +++++++++++++++--------------
 spacy/matcher/matcher.pyx          |  14 +-
 spacy/syntax/nn_parser.pyx         |   7 +
 spacy/syntax/transition_system.pyx |   2 +
 spacy/tokenizer.pyx                |   3 +
 spacy/tokens/_retokenize.pyx       |   3 +
 spacy/tokens/doc.pyx               |   2 +
 7 files changed, 152 insertions(+), 125 deletions(-)

diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 990440f59..7bf89c84a 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -546,7 +546,7 @@ cdef class GoldParse:
     def __init__(self, doc, annot_tuples=None, words=None, tags=None,
                  morphology=None, heads=None, deps=None, entities=None,
                  make_projective=False, cats=None, links=None, **_):
-        """Create a GoldParse.
+        """Create a GoldParse. The fields will not be initialized if len(doc) is zero.
 
         doc (Doc): The document the annotations refer to.
         words (iterable): A sequence of unicode word strings.
@@ -575,138 +575,142 @@ cdef class GoldParse:
             negative examples respectively.
         RETURNS (GoldParse): The newly constructed object.
         """
-        if words is None:
-            words = [token.text for token in doc]
-        if tags is None:
-            tags = [None for _ in words]
-        if heads is None:
-            heads = [None for _ in words]
-        if deps is None:
-            deps = [None for _ in words]
-        if morphology is None:
-            morphology = [None for _ in words]
-        if entities is None:
-            entities = ["-" for _ in doc]
-        elif len(entities) == 0:
-            entities = ["O" for _ in doc]
-        else:
-            # Translate the None values to '-', to make processing easier.
-            # See Issue #2603
-            entities = [(ent if ent is not None else "-") for ent in entities]
-            if not isinstance(entities[0], basestring):
-                # Assume we have entities specified by character offset.
-                entities = biluo_tags_from_offsets(doc, entities)
         self.mem = Pool()
         self.loss = 0
         self.length = len(doc)
-        # These are filled by the tagger/parser/entity recogniser
-        self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
-        self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
-        self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
-        self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
-        self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
-        self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
         self.cats = {} if cats is None else dict(cats)
         self.links = links
-        self.words = [None] * len(doc)
-        self.tags = [None] * len(doc)
-        self.heads = [None] * len(doc)
-        self.labels = [None] * len(doc)
-        self.ner = [None] * len(doc)
-        self.morphology = [None] * len(doc)
-        # This needs to be done before we align the words
-        if make_projective and heads is not None and deps is not None:
-            heads, deps = nonproj.projectivize(heads, deps)
-
-        # Do many-to-one alignment for misaligned tokens.
-        # If we over-segment, we'll have one gold word that covers a sequence
-        # of predicted words
-        # If we under-segment, we'll have one predicted word that covers a
-        # sequence of gold words.
-        # If we "mis-segment", we'll have a sequence of predicted words covering
-        # a sequence of gold words. That's many-to-many -- we don't do that.
-        cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
-
-        self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
-        self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
-
-        annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
-        self.orig_annot = list(zip(*annot_tuples))
-
-        for i, gold_i in enumerate(self.cand_to_gold):
-            if doc[i].text.isspace():
-                self.words[i] = doc[i].text
-                self.tags[i] = "_SP"
-                self.heads[i] = None
-                self.labels[i] = None
-                self.ner[i] = None
-                self.morphology[i] = set()
-            if gold_i is None:
-                if i in i2j_multi:
-                    self.words[i] = words[i2j_multi[i]]
-                    self.tags[i] = tags[i2j_multi[i]]
-                    self.morphology[i] = morphology[i2j_multi[i]]
-                    is_last = i2j_multi[i] != i2j_multi.get(i+1)
-                    is_first = i2j_multi[i] != i2j_multi.get(i-1)
-                    # Set next word in multi-token span as head, until last
-                    if not is_last:
-                        self.heads[i] = i+1
-                        self.labels[i] = "subtok"
-                    else:
-                        self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
-                        self.labels[i] = deps[i2j_multi[i]]
-                    # Now set NER...This is annoying because if we've split
-                    # got an entity word split into two, we need to adjust the
-                    # BILUO tags. We can't have BB or LL etc.
-                    # Case 1: O -- easy.
-                    ner_tag = entities[i2j_multi[i]]
-                    if ner_tag == "O":
-                        self.ner[i] = "O"
-                    # Case 2: U. This has to become a B I* L sequence.
-                    elif ner_tag.startswith("U-"):
-                        if is_first:
-                            self.ner[i] = ner_tag.replace("U-", "B-", 1)
-                        elif is_last:
-                            self.ner[i] = ner_tag.replace("U-", "L-", 1)
-                        else:
-                            self.ner[i] = ner_tag.replace("U-", "I-", 1)
-                    # Case 3: L. If not last, change to I.
-                    elif ner_tag.startswith("L-"):
-                        if is_last:
-                            self.ner[i] = ner_tag
-                        else:
-                            self.ner[i] = ner_tag.replace("L-", "I-", 1)
-                    # Case 4: I. Stays correct
-                    elif ner_tag.startswith("I-"):
-                        self.ner[i] = ner_tag
+        # avoid allocating memory if the doc does not contain any tokens
+        if self.length > 0:
+            if words is None:
+                words = [token.text for token in doc]
+            if tags is None:
+                tags = [None for _ in words]
+            if heads is None:
+                heads = [None for _ in words]
+            if deps is None:
+                deps = [None for _ in words]
+            if morphology is None:
+                morphology = [None for _ in words]
+            if entities is None:
+                entities = ["-" for _ in doc]
+            elif len(entities) == 0:
+                entities = ["O" for _ in doc]
             else:
-                self.words[i] = words[gold_i]
-                self.tags[i] = tags[gold_i]
-                self.morphology[i] = morphology[gold_i]
-                if heads[gold_i] is None:
+                # Translate the None values to '-', to make processing easier.
+                # See Issue #2603
+                entities = [(ent if ent is not None else "-") for ent in entities]
+                if not isinstance(entities[0], basestring):
+                    # Assume we have entities specified by character offset.
+                    entities = biluo_tags_from_offsets(doc, entities)
+
+            # These are filled by the tagger/parser/entity recogniser
+            self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
+            self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
+            self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
+            self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
+            self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
+            self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
+
+            self.words = [None] * len(doc)
+            self.tags = [None] * len(doc)
+            self.heads = [None] * len(doc)
+            self.labels = [None] * len(doc)
+            self.ner = [None] * len(doc)
+            self.morphology = [None] * len(doc)
+
+            # This needs to be done before we align the words
+            if make_projective and heads is not None and deps is not None:
+                heads, deps = nonproj.projectivize(heads, deps)
+
+            # Do many-to-one alignment for misaligned tokens.
+            # If we over-segment, we'll have one gold word that covers a sequence
+            # of predicted words
+            # If we under-segment, we'll have one predicted word that covers a
+            # sequence of gold words.
+            # If we "mis-segment", we'll have a sequence of predicted words covering
+            # a sequence of gold words. That's many-to-many -- we don't do that.
+            cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
+
+            self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
+            self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
+
+            annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
+            self.orig_annot = list(zip(*annot_tuples))
+
+            for i, gold_i in enumerate(self.cand_to_gold):
+                if doc[i].text.isspace():
+                    self.words[i] = doc[i].text
+                    self.tags[i] = "_SP"
                     self.heads[i] = None
+                    self.labels[i] = None
+                    self.ner[i] = None
+                    self.morphology[i] = set()
+                if gold_i is None:
+                    if i in i2j_multi:
+                        self.words[i] = words[i2j_multi[i]]
+                        self.tags[i] = tags[i2j_multi[i]]
+                        self.morphology[i] = morphology[i2j_multi[i]]
+                        is_last = i2j_multi[i] != i2j_multi.get(i+1)
+                        is_first = i2j_multi[i] != i2j_multi.get(i-1)
+                        # Set next word in multi-token span as head, until last
+                        if not is_last:
+                            self.heads[i] = i+1
+                            self.labels[i] = "subtok"
+                        else:
+                            self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
+                            self.labels[i] = deps[i2j_multi[i]]
+                        # Now set NER...This is annoying because if we've split
+                        # got an entity word split into two, we need to adjust the
+                        # BILUO tags. We can't have BB or LL etc.
+                        # Case 1: O -- easy.
+                        ner_tag = entities[i2j_multi[i]]
+                        if ner_tag == "O":
+                            self.ner[i] = "O"
+                        # Case 2: U. This has to become a B I* L sequence.
+                        elif ner_tag.startswith("U-"):
+                            if is_first:
+                                self.ner[i] = ner_tag.replace("U-", "B-", 1)
+                            elif is_last:
+                                self.ner[i] = ner_tag.replace("U-", "L-", 1)
+                            else:
+                                self.ner[i] = ner_tag.replace("U-", "I-", 1)
+                        # Case 3: L. If not last, change to I.
+                        elif ner_tag.startswith("L-"):
+                            if is_last:
+                                self.ner[i] = ner_tag
+                            else:
+                                self.ner[i] = ner_tag.replace("L-", "I-", 1)
+                        # Case 4: I. Stays correct
+                        elif ner_tag.startswith("I-"):
+                            self.ner[i] = ner_tag
                 else:
-                    self.heads[i] = self.gold_to_cand[heads[gold_i]]
-                self.labels[i] = deps[gold_i]
-                self.ner[i] = entities[gold_i]
+                    self.words[i] = words[gold_i]
+                    self.tags[i] = tags[gold_i]
+                    self.morphology[i] = morphology[gold_i]
+                    if heads[gold_i] is None:
+                        self.heads[i] = None
+                    else:
+                        self.heads[i] = self.gold_to_cand[heads[gold_i]]
+                    self.labels[i] = deps[gold_i]
+                    self.ner[i] = entities[gold_i]
 
-        # Prevent whitespace that isn't within entities from being tagged as
-        # an entity.
-        for i in range(len(self.ner)):
-            if self.tags[i] == "_SP":
-                prev_ner = self.ner[i-1] if i >= 1 else None
-                next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
-                if prev_ner == "O" or next_ner == "O":
-                    self.ner[i] = "O"
+            # Prevent whitespace that isn't within entities from being tagged as
+            # an entity.
+            for i in range(len(self.ner)):
+                if self.tags[i] == "_SP":
+                    prev_ner = self.ner[i-1] if i >= 1 else None
+                    next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
+                    if prev_ner == "O" or next_ner == "O":
+                        self.ner[i] = "O"
 
-        cycle = nonproj.contains_cycle(self.heads)
-        if cycle is not None:
-            raise ValueError(Errors.E069.format(cycle=cycle,
-                cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
-                doc_tokens=" ".join(words[:50])))
+            cycle = nonproj.contains_cycle(self.heads)
+            if cycle is not None:
+                raise ValueError(Errors.E069.format(cycle=cycle,
+                    cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
+                    doc_tokens=" ".join(words[:50])))
 
     def __len__(self):
         """Get the number of gold-standard tokens.
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 5dd6eab77..af0450592 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -254,7 +254,12 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
     cdef PatternStateC state
     cdef int i, j, nr_extra_attr
     cdef Pool mem = Pool()
-    predicate_cache = <char*>mem.alloc(doc.length * len(predicates), sizeof(char))
+    output = []
+    if doc.length == 0:
+        # avoid any processing or mem alloc if the document is empty
+        return output
+    if len(predicates) > 0:
+        predicate_cache = <char*>mem.alloc(doc.length * len(predicates), sizeof(char))
     if extensions is not None and len(extensions) >= 1:
         nr_extra_attr = max(extensions.values()) + 1
         extra_attr_values = <attr_t*>mem.alloc(doc.length * nr_extra_attr, sizeof(attr_t))
@@ -278,7 +283,6 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
         predicate_cache += len(predicates)
     # Handle matches that end in 0-width patterns
     finish_states(matches, states)
-    output = []
     seen = set()
     for i in range(matches.size()):
         match = (
@@ -560,12 +564,14 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
         for j, (attr, value) in enumerate(spec):
             pattern[i].attrs[j].attr = attr
             pattern[i].attrs[j].value = value
-        pattern[i].extra_attrs = <IndexValueC*>mem.alloc(len(extensions), sizeof(IndexValueC))
+        if len(extensions) > 0:
+            pattern[i].extra_attrs = <IndexValueC*>mem.alloc(len(extensions), sizeof(IndexValueC))
         for j, (index, value) in enumerate(extensions):
             pattern[i].extra_attrs[j].index = index
             pattern[i].extra_attrs[j].value = value
         pattern[i].nr_extra_attr = len(extensions)
-        pattern[i].py_predicates = <int32_t*>mem.alloc(len(predicates), sizeof(int32_t))
+        if len(predicates) > 0:
+            pattern[i].py_predicates = <int32_t*>mem.alloc(len(predicates), sizeof(int32_t))
         for j, index in enumerate(predicates):
             pattern[i].py_predicates[j] = index
         pattern[i].nr_py = len(predicates)
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 55b9c628b..dd19b0e43 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -364,6 +364,9 @@ cdef class Parser:
     cdef void c_transition_batch(self, StateC** states, const float* scores,
             int nr_class, int batch_size) nogil:
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        with gil:
+            assert self.moves.n_moves > 0
         is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
         cdef int i, guess
         cdef Transition action
@@ -547,6 +550,10 @@ cdef class Parser:
         cdef GoldParse gold
         cdef Pool mem = Pool()
         cdef int i
+
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        assert self.moves.n_moves > 0
+
         is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
         costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
         cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx
index 58b3a6993..7876813e0 100644
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -83,6 +83,8 @@ cdef class TransitionSystem:
 
     def get_oracle_sequence(self, doc, GoldParse gold):
         cdef Pool mem = Pool()
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        assert self.n_moves > 0
         costs = <float*>mem.alloc(self.n_moves, sizeof(float))
         is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
 
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index cdfa55dcb..b39bb1ecb 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -331,6 +331,9 @@ cdef class Tokenizer:
     cdef int _save_cached(self, const TokenC* tokens, hash_t key,
                           int has_special, int n) except -1:
         cdef int i
+        if n <= 0:
+            # avoid mem alloc of zero length
+            return 0
         for i in range(n):
             if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
                 return 0
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index f8b13dd78..5f890de45 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -157,6 +157,9 @@ def _merge(Doc doc, merges):
     cdef TokenC* token
     cdef Pool mem = Pool()
     cdef int merged_iob = 0
+
+    # merges should not be empty, but make sure to avoid zero-length mem alloc
+    assert len(merges) > 0
     tokens = <TokenC*>mem.alloc(len(merges), sizeof(TokenC))
     spans = []
 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 80a808bae..6afe89e05 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -791,6 +791,8 @@ cdef class Doc:
         # Get set up for fast loading
         cdef Pool mem = Pool()
         cdef int n_attrs = len(attrs)
+        # attrs should not be empty, but make sure to avoid zero-length mem alloc
+        assert n_attrs > 0
         attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t))
         for i, attr_id in enumerate(attrs):
             attr_ids[i] = attr_id

From 3f6cb618a9ec584dd8ac4bcfec97b2f8ec35a725 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 22 Oct 2019 17:47:36 +0200
Subject: [PATCH 3/8] Set version to v2.2.2.dev0

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 7834bfd12..0273b9536 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.2.1"
+__version__ = "2.2.2.dev0"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 74a19aeb1cc384e8ccdff8b664c9df0550be2f92 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 22 Oct 2019 18:18:43 +0200
Subject: [PATCH 4/8] Add xfailing test [ci skip]

---
 spacy/tests/vocab_vectors/test_vectors.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 4705f8e77..214285b01 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -141,6 +141,19 @@ def test_vectors_most_similar(most_similar_vectors_data):
     assert all(row[0] == i for i, row in enumerate(best_rows))
 
 
+@pytest.mark.xfail
+def test_vectors_most_similar_identical():
+    """Test that most similar identical vectors are assigned a score of 1.0."""
+    data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
+    v = Vectors(data=data, keys=["A", "B", "C"])
+    keys, _, scores = v.most_similar(numpy.asarray([[4, 2, 2, 2]], dtype="f"))
+    assert scores[0][0] == 1.0  # not 1.0000002
+    data = numpy.asarray([[1, 2, 3], [1, 2, 3], [1, 1, 1]], dtype="f")
+    v = Vectors(data=data, keys=["A", "B", "C"])
+    keys, _, scores = v.most_similar(numpy.asarray([[1, 2, 3]], dtype="f"))
+    assert scores[0][0] == 1.0  # not 0.9999999
+
+
 @pytest.mark.parametrize("text", ["apple and orange"])
 def test_vectors_token_vector(tokenizer_v, vectors, text):
     doc = tokenizer_v(text)

From 9489c5f6b2e58f2de4f9ff463982b1b9430a5c7f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 22 Oct 2019 20:10:42 +0200
Subject: [PATCH 5/8] Clip most_similar to range [-1, 1] (fixes #4506) (#4507)

* Clip most_similar to range [-1, 1]
* Add/fix vectors tests
* Fix test
---
 spacy/tests/vocab_vectors/test_vectors.py | 3 +--
 spacy/vectors.pyx                         | 6 +++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 214285b01..b688ab9dd 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -141,7 +141,6 @@ def test_vectors_most_similar(most_similar_vectors_data):
     assert all(row[0] == i for i, row in enumerate(best_rows))
 
 
-@pytest.mark.xfail
 def test_vectors_most_similar_identical():
     """Test that most similar identical vectors are assigned a score of 1.0."""
     data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
@@ -315,4 +314,4 @@ def test_vocab_prune_vectors():
     assert list(remap.keys()) == ["kitten"]
     neighbour, similarity = list(remap.values())[0]
     assert neighbour == "cat", remap
-    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)
+    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 0f015521a..44dddb30c 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -344,8 +344,12 @@ cdef class Vectors:
             sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1]
             scores[i:i+batch_size] = scores[sorted_index]
             best_rows[i:i+batch_size] = best_rows[sorted_index]
-
+        xp = get_array_module(self.data)
+        # Round values really close to 1 or -1
+        scores = xp.around(scores, decimals=4, out=scores)
+        # Account for numerical error we want to return in range -1, 1
+        scores = xp.clip(scores, a_min=-1, a_max=1, out=scores)
         row2key = {row: key for key, row in self.key2row.items()}
         keys = xp.asarray(
             [[row2key[row] for row in best_rows[i] if row in row2key]

From ca7f0e669e1d7581effa77d44f44fffd193d11c4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 22 Oct 2019 20:11:25 +0200
Subject: [PATCH 6/8] Set version to v2.2.2.dev1

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 0273b9536..086e53242 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.2.2.dev0"
+__version__ = "2.2.2.dev1"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
"https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 835498d24fcdb408e314ef7f43afcbfc5f15f522 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Oct 2019 14:31:09 +0200 Subject: [PATCH 7/8] Update azure-pipelines.yml --- azure-pipelines.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index c23995de6..512c6414c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -9,6 +9,11 @@ trigger: exclude: - 'website/*' - '*.md' +pr: + paths: + exclude: + - 'website/*' + - '*.md' jobs: From 7fc39f124c2bf9b62c63caccd005e9ae7add078b Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 23 Oct 2019 14:41:21 +0200 Subject: [PATCH 8/8] Fix logic in rules+model entity example [ci skip] (#4510) --- website/docs/usage/rule-based-matching.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index fe8e4e2d2..a375f416c 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -1135,6 +1135,8 @@ def expand_person_entities(doc): if prev_token.text in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."): new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label) new_ents.append(new_ent) + else: + new_ents.append(ent) else: new_ents.append(ent) doc.ents = new_ents