mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Update develop from master for v3.0.0rc5 (#6811)
* Fix `spacy.util.minibatch` when the size iterator is finished (#6745)
* Skip 0-length matches (#6759)
  Add hack to prevent matcher from returning 0-length matches.
* support IS_SENT_START in PhraseMatcher (#6771)
  * support IS_SENT_START in PhraseMatcher
  * add unit test and friendlier error
  * use IDS.get instead
* ensure span.text works for an empty span (#6772)
* Remove unicode_literals

Co-authored-by: Santiago Castro <bryant@montevideo.com.uy>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
c0926c9088
commit
2263bc7b28
|
@ -342,7 +342,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
||||||
# We need to deduplicate, because we could otherwise arrive at the same
|
# We need to deduplicate, because we could otherwise arrive at the same
|
||||||
# match through two paths, e.g. .?.? matching 'a'. Are we matching the
|
# match through two paths, e.g. .?.? matching 'a'. Are we matching the
|
||||||
# first .?, or the second .? -- it doesn't matter, it's just one match.
|
# first .?, or the second .? -- it doesn't matter, it's just one match.
|
||||||
if match not in seen:
|
# Skip 0-length matches. (TODO: fix algorithm)
|
||||||
|
if match not in seen and matches[i].length > 0:
|
||||||
output.append(match)
|
output.append(match)
|
||||||
seen.add(match)
|
seen.add(match)
|
||||||
return output
|
return output
|
||||||
|
|
|
@ -5,6 +5,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH
|
from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH
|
||||||
|
from ..attrs import IDS
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
from ..tokens.token cimport Token
|
from ..tokens.token cimport Token
|
||||||
from ..tokens.span cimport Span
|
from ..tokens.span cimport Span
|
||||||
|
@ -52,9 +53,11 @@ cdef class PhraseMatcher:
|
||||||
attr = attr.upper()
|
attr = attr.upper()
|
||||||
if attr == "TEXT":
|
if attr == "TEXT":
|
||||||
attr = "ORTH"
|
attr = "ORTH"
|
||||||
|
if attr == "IS_SENT_START":
|
||||||
|
attr = "SENT_START"
|
||||||
if attr.lower() not in TokenPattern().dict():
|
if attr.lower() not in TokenPattern().dict():
|
||||||
raise ValueError(Errors.E152.format(attr=attr))
|
raise ValueError(Errors.E152.format(attr=attr))
|
||||||
self.attr = self.vocab.strings[attr]
|
self.attr = IDS.get(attr)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""Get the number of match IDs added to the matcher.
|
"""Get the number of match IDs added to the matcher.
|
||||||
|
|
|
@ -521,3 +521,22 @@ def test_matcher_deprecated(matcher):
|
||||||
pass
|
pass
|
||||||
assert record.list
|
assert record.list
|
||||||
assert "spaCy v3.0" in str(record.list[0].message)
|
assert "spaCy v3.0" in str(record.list[0].message)
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_remove_zero_operator(en_vocab):
    """A rule whose only token uses the "!" operator gives no matches and can be removed.

    Checks that the matcher returns nothing for the sample doc, that the rule
    is registered under its key, and that ``remove`` unregisters it again.
    """
    rule_matcher = Matcher(en_vocab)
    rule_matcher.add("Rule", [[{"OP": "!"}]])
    sample = Doc(en_vocab, words=["This", "is", "a", "test", "."])
    found = rule_matcher(sample)
    assert len(found) == 0
    # The rule must be present before removal and absent afterwards.
    assert "Rule" in rule_matcher
    rule_matcher.remove("Rule")
    assert "Rule" not in rule_matcher
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_no_zero_length(en_vocab):
    """A "?" pattern on a tag absent from the doc must not produce any match."""
    tagged_doc = Doc(en_vocab, words=["a", "b"], tags=["A", "B"])
    # Tag "C" does not occur in the doc, so the only way this pattern could
    # "match" is with zero tokens -- which the matcher is expected to skip.
    optional_c = [{"TAG": "C", "OP": "?"}]
    tag_matcher = Matcher(en_vocab)
    tag_matcher.add("TEST", [optional_c])
    found = tag_matcher(tagged_doc)
    assert len(found) == 0
|
||||||
|
|
|
@ -318,3 +318,8 @@ def test_phrase_matcher_deprecated(en_vocab):
|
||||||
pass
|
pass
|
||||||
assert record.list
|
assert record.list
|
||||||
assert "spaCy v3.0" in str(record.list[0].message)
|
assert "spaCy v3.0" in str(record.list[0].message)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"])
def test_phrase_matcher_sent_start(en_vocab, attr):
    """Constructing a PhraseMatcher with either sentence-start attr name must not raise."""
    # The test passes as long as construction succeeds for both spellings.
    phrase_matcher = PhraseMatcher(en_vocab, attr=attr)
|
||||||
|
|
5
spacy/tests/regression/test_issue6755.py
Normal file
5
spacy/tests/regression/test_issue6755.py
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
def test_issue6755(en_tokenizer):
    """An empty span exposes empty strings for both ``text_with_ws`` and ``text``."""
    tokens = en_tokenizer("This is a magnificent sentence.")
    # Slicing up to index 0 yields a span that contains no tokens.
    empty_span = tokens[:0]
    assert empty_span.text_with_ws == ""
    assert empty_span.text == ""
|
|
@ -473,7 +473,7 @@ cdef class Span:
|
||||||
def text(self):
|
def text(self):
|
||||||
"""RETURNS (str): The original verbatim text of the span."""
|
"""RETURNS (str): The original verbatim text of the span."""
|
||||||
text = self.text_with_ws
|
text = self.text_with_ws
|
||||||
if self[-1].whitespace_:
|
if len(self) > 0 and self[-1].whitespace_:
|
||||||
text = text[:-1]
|
text = text[:-1]
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user