From 2263bc7b286d69e053139b5b6ccfaca90df7510a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 26 Jan 2021 04:52:45 +0100 Subject: [PATCH] Update develop from master for v3.0.0rc5 (#6811) * Fix `spacy.util.minibatch` when the size iterator is finished (#6745) * Skip 0-length matches (#6759) Add hack to prevent matcher from returning 0-length matches. * support IS_SENT_START in PhraseMatcher (#6771) * support IS_SENT_START in PhraseMatcher * add unit test and friendlier error * use IDS.get instead * ensure span.text works for an empty span (#6772) * Remove unicode_literals Co-authored-by: Santiago Castro Co-authored-by: Sofie Van Landeghem --- spacy/matcher/matcher.pyx | 3 ++- spacy/matcher/phrasematcher.pyx | 5 ++++- spacy/tests/matcher/test_matcher_api.py | 19 +++++++++++++++++++ spacy/tests/matcher/test_phrase_matcher.py | 5 +++++ spacy/tests/regression/test_issue6755.py | 5 +++++ spacy/tokens/span.pyx | 2 +- 6 files changed, 36 insertions(+), 3 deletions(-) create mode 100644 spacy/tests/regression/test_issue6755.py diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 31699bfa1..803e8edac 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -342,7 +342,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e # We need to deduplicate, because we could otherwise arrive at the same # match through two paths, e.g. .?.? matching 'a'. Are we matching the # first .?, or the second .? -- it doesn't matter, it's just one match. - if match not in seen: + # Skip 0-length matches. (TODO: fix algorithm) + if match not in seen and matches[i].length > 0: output.append(match) seen.add(match) return output diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 7e99859b5..fc5c16506 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -5,6 +5,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter import warnings from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH +from ..attrs import IDS from ..structs cimport TokenC from ..tokens.token cimport Token from ..tokens.span cimport Span @@ -52,9 +53,11 @@ cdef class PhraseMatcher: attr = attr.upper() if attr == "TEXT": attr = "ORTH" + if attr == "IS_SENT_START": + attr = "SENT_START" if attr.lower() not in TokenPattern().dict(): raise ValueError(Errors.E152.format(attr=attr)) - self.attr = self.vocab.strings[attr] + self.attr = IDS.get(attr) def __len__(self): """Get the number of match IDs added to the matcher. diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 91f843a93..094bf22a6 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -521,3 +521,22 @@ def test_matcher_deprecated(matcher): pass assert record.list assert "spaCy v3.0" in str(record.list[0].message) + + +def test_matcher_remove_zero_operator(en_vocab): + matcher = Matcher(en_vocab) + pattern = [{"OP": "!"}] + matcher.add("Rule", [pattern]) + doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) + matches = matcher(doc) + assert len(matches) == 0 + assert "Rule" in matcher + matcher.remove("Rule") + assert "Rule" not in matcher + + +def test_matcher_no_zero_length(en_vocab): + doc = Doc(en_vocab, words=["a", "b"], tags=["A", "B"]) + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]]) + assert len(matcher(doc)) == 0 diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 1b81fd780..e95bd5eba 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -318,3 +318,8 @@ def test_phrase_matcher_deprecated(en_vocab): pass assert record.list assert "spaCy v3.0" in str(record.list[0].message) + + +@pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"]) +def test_phrase_matcher_sent_start(en_vocab, attr): + matcher = PhraseMatcher(en_vocab, attr=attr) diff --git a/spacy/tests/regression/test_issue6755.py b/spacy/tests/regression/test_issue6755.py new file mode 100644 index 000000000..15ddd6fbc --- /dev/null +++ b/spacy/tests/regression/test_issue6755.py @@ -0,0 +1,5 @@ +def test_issue6755(en_tokenizer): + doc = en_tokenizer("This is a magnificent sentence.") + span = doc[:0] + assert span.text_with_ws == "" + assert span.text == "" diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 4e6fb84f5..42b9cc227 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -473,7 +473,7 @@ cdef class Span: def text(self): """RETURNS (str): The original verbatim text of the span.""" text = self.text_with_ws - if self[-1].whitespace_: + if len(self) > 0 and self[-1].whitespace_: text = text[:-1] return text