From 2263bc7b286d69e053139b5b6ccfaca90df7510a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 26 Jan 2021 04:52:45 +0100
Subject: [PATCH] Update develop from master for v3.0.0rc5 (#6811)

* Fix `spacy.util.minibatch` when the size iterator is finished (#6745)

* Skip 0-length matches (#6759)

Add hack to prevent matcher from returning 0-length matches.

* support IS_SENT_START in PhraseMatcher (#6771)

* support IS_SENT_START in PhraseMatcher

* add unit test and friendlier error

* use IDS.get instead

* ensure span.text works for an empty span (#6772)

* Remove unicode_literals

Co-authored-by: Santiago Castro <bryant@montevideo.com.uy>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/matcher/matcher.pyx                  |  3 ++-
 spacy/matcher/phrasematcher.pyx            |  5 ++++-
 spacy/tests/matcher/test_matcher_api.py    | 19 +++++++++++++++++++
 spacy/tests/matcher/test_phrase_matcher.py |  5 +++++
 spacy/tests/regression/test_issue6755.py   |  5 +++++
 spacy/tokens/span.pyx                      |  2 +-
 6 files changed, 36 insertions(+), 3 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue6755.py

diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 31699bfa1..803e8edac 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -342,7 +342,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
         # We need to deduplicate, because we could otherwise arrive at the same
         # match through two paths, e.g. .?.? matching 'a'. Are we matching the
         # first .?, or the second .? -- it doesn't matter, it's just one match.
-        if match not in seen:
+        # Workaround: drop 0-length matches here. (TODO: fix the algorithm itself)
+        if match not in seen and matches[i].length > 0:
             output.append(match)
             seen.add(match)
     return output
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 7e99859b5..fc5c16506 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -5,6 +5,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
 import warnings
 
 from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH
+from ..attrs import IDS
 from ..structs cimport TokenC
 from ..tokens.token cimport Token
 from ..tokens.span cimport Span
@@ -52,9 +53,11 @@ cdef class PhraseMatcher:
             attr = attr.upper()
             if attr == "TEXT":
                 attr = "ORTH"
+            if attr == "IS_SENT_START":
+                attr = "SENT_START"
             if attr.lower() not in TokenPattern().dict():
                 raise ValueError(Errors.E152.format(attr=attr))
-            self.attr = self.vocab.strings[attr]
+            self.attr = IDS.get(attr)
 
     def __len__(self):
         """Get the number of match IDs added to the matcher.
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 91f843a93..094bf22a6 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -521,3 +521,22 @@ def test_matcher_deprecated(matcher):
             pass
         assert record.list
         assert "spaCy v3.0" in str(record.list[0].message)
+
+
+def test_matcher_remove_zero_operator(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{"OP": "!"}]  # token spec with only an "OP" key, no attributes
+    matcher.add("Rule", [pattern])
+    doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
+    matches = matcher(doc)
+    assert len(matches) == 0  # this pattern must not yield any match
+    assert "Rule" in matcher
+    matcher.remove("Rule")  # removing the rule must work for this pattern too
+    assert "Rule" not in matcher
+
+
+def test_matcher_no_zero_length(en_vocab):
+    doc = Doc(en_vocab, words=["a", "b"], tags=["A", "B"])
+    matcher = Matcher(en_vocab)
+    matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])  # tag "C" is not in doc
+    assert len(matcher(doc)) == 0  # no 0-length match may be returned
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 1b81fd780..e95bd5eba 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -318,3 +318,8 @@ def test_phrase_matcher_deprecated(en_vocab):
             pass
         assert record.list
         assert "spaCy v3.0" in str(record.list[0].message)
+
+
+@pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"])
+def test_phrase_matcher_sent_start(en_vocab, attr):
+    matcher = PhraseMatcher(en_vocab, attr=attr)  # smoke test: must not raise
diff --git a/spacy/tests/regression/test_issue6755.py b/spacy/tests/regression/test_issue6755.py
new file mode 100644
index 000000000..15ddd6fbc
--- /dev/null
+++ b/spacy/tests/regression/test_issue6755.py
@@ -0,0 +1,5 @@
+def test_issue6755(en_tokenizer):
+    doc = en_tokenizer("This is a magnificent sentence.")
+    span = doc[:0]  # empty span: zero tokens
+    assert span.text_with_ws == ""
+    assert span.text == ""  # Span.text must handle an empty span
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 4e6fb84f5..42b9cc227 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -473,7 +473,7 @@ cdef class Span:
     def text(self):
         """RETURNS (str): The original verbatim text of the span."""
         text = self.text_with_ws
-        if self[-1].whitespace_:
+        if len(self) > 0 and self[-1].whitespace_:
             text = text[:-1]
         return text