Update develop from master for v3.0.0rc5 (#6811)

* Fix `spacy.util.minibatch` when the size iterator is finished (#6745)

* Skip 0-length matches (#6759)

Add hack to prevent matcher from returning 0-length matches.

* support IS_SENT_START in PhraseMatcher (#6771)

* support IS_SENT_START in PhraseMatcher

* add unit test and friendlier error

* use IDS.get instead

* ensure span.text works for an empty span (#6772)

* Remove unicode_literals

Co-authored-by: Santiago Castro <bryant@montevideo.com.uy>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Author: Adriane Boyd, 2021-01-26 04:52:45 +01:00 (committed by GitHub)
Parent: c0926c9088
Commit: 2263bc7b28
6 changed files with 36 additions and 3 deletions
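
The `spacy.util.minibatch` fix (#6745) listed first in the commit message lands in `spacy.util` rather than in the files diffed below. As background, here is a minimal usage sketch (assuming spaCy v3.x); it illustrates the normal API only and does not assert the exact post-fix behaviour once a finite size iterator runs out:

from spacy.util import minibatch

items = list(range(10))

# With an integer size, batches are fixed-size chunks (the last may be smaller).
assert [len(batch) for batch in minibatch(items, size=4)] == [4, 4, 2]

# `size` may also be an iterator yielding one batch size per step; the edge
# case fixed in #6745 is a size iterator that is exhausted before the items.
assert [len(batch) for batch in minibatch(items, size=iter([2, 3, 5, 5]))] == [2, 3, 5]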


@@ -342,7 +342,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
         # We need to deduplicate, because we could otherwise arrive at the same
         # match through two paths, e.g. .?.? matching 'a'. Are we matching the
         # first .?, or the second .? -- it doesn't matter, it's just one match.
-        if match not in seen:
+        # Skip 0-length matches. (TODO: fix algorithm)
+        if match not in seen and matches[i].length > 0:
             output.append(match)
             seen.add(match)
     return output
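
The user-visible effect of the length check, in a short sketch (assuming spaCy v3.x; the pattern and example text are illustrative, mirroring the new tests further down): a pattern whose only token is optional can match zero tokens, and such candidates are no longer returned.

# Sketch: an optional-only pattern no longer yields 0-length matches.
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("MAYBE_DIGIT", [[{"IS_DIGIT": True, "OP": "?"}]])

doc = nlp("no digits here")
# With the matches[i].length > 0 check in place, no zero-length matches come back.
assert matcher(doc) == []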


@@ -5,6 +5,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
 import warnings

 from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH
+from ..attrs import IDS
 from ..structs cimport TokenC
 from ..tokens.token cimport Token
 from ..tokens.span cimport Span
@@ -52,9 +53,11 @@ cdef class PhraseMatcher:
             attr = attr.upper()
             if attr == "TEXT":
                 attr = "ORTH"
+            if attr == "IS_SENT_START":
+                attr = "SENT_START"
             if attr.lower() not in TokenPattern().dict():
                 raise ValueError(Errors.E152.format(attr=attr))
-            self.attr = self.vocab.strings[attr]
+            self.attr = IDS.get(attr)

     def __len__(self):
         """Get the number of match IDs added to the matcher.


@@ -521,3 +521,22 @@ def test_matcher_deprecated(matcher):
         pass
     assert record.list
     assert "spaCy v3.0" in str(record.list[0].message)
+
+
+def test_matcher_remove_zero_operator(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{"OP": "!"}]
+    matcher.add("Rule", [pattern])
+    doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
+    matches = matcher(doc)
+    assert len(matches) == 0
+    assert "Rule" in matcher
+    matcher.remove("Rule")
+    assert "Rule" not in matcher
+
+
+def test_matcher_no_zero_length(en_vocab):
+    doc = Doc(en_vocab, words=["a", "b"], tags=["A", "B"])
+    matcher = Matcher(en_vocab)
+    matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
+    assert len(matcher(doc)) == 0


@@ -318,3 +318,8 @@ def test_phrase_matcher_deprecated(en_vocab):
         pass
     assert record.list
     assert "spaCy v3.0" in str(record.list[0].message)
+
+
+@pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"])
+def test_phrase_matcher_sent_start(en_vocab, attr):
+    matcher = PhraseMatcher(en_vocab, attr=attr)


@@ -0,0 +1,5 @@
+def test_issue6755(en_tokenizer):
+    doc = en_tokenizer("This is a magnificent sentence.")
+    span = doc[:0]
+    assert span.text_with_ws == ""
+    assert span.text == ""


@@ -473,7 +473,7 @@ cdef class Span:
     def text(self):
         """RETURNS (str): The original verbatim text of the span."""
         text = self.text_with_ws
-        if self[-1].whitespace_:
+        if len(self) > 0 and self[-1].whitespace_:
             text = text[:-1]
         return text

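
The added `len(self) > 0` guard is what makes the regression test above pass: a zero-length span has no last token whose trailing whitespace could be stripped. A short sketch of the fixed behaviour (assuming spaCy v3.x):

import spacy

nlp = spacy.blank("en")
doc = nlp("This is a magnificent sentence.")

empty = doc[3:3]  # zero-length span
assert empty.text == ""
assert empty.text_with_ws == ""

# Non-empty spans are unchanged: trailing whitespace is still stripped.
assert doc[0:2].text == "This is"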