# coding: utf8
from __future__ import unicode_literals

from spacy.matcher import Matcher
from spacy.tokens import Span


def test_issue2569(en_tokenizer):
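    # Regression test for issue #2569: a Matcher pattern using ENT_TYPE
    # with the "+" operator over a multi-token entity.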
    doc = en_tokenizer("It is May 15, 1993.")
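    # Mark "May 15, 1993" (tokens 2-5) as a DATE entity.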
    doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])]
    matcher = Matcher(doc.vocab)
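    # Match one or more consecutive tokens whose entity type is DATE.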
    matcher.add("RULE", None, [{"ENT_TYPE": "DATE", "OP": "+"}])
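    # The matcher yields (match_id, start, end) tuples; slice the doc
    # to turn each one into a Span.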
    matched = [doc[start:end] for _, start, end in matcher(doc)]
    matched = sorted(matched, key=len, reverse=True)
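    # A 4-token entity has 4 + 3 + 2 + 1 = 10 contiguous sub-spans, so the
    # matcher returns 10 matches; the longest covers the full entity.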
    assert len(matched) == 10
    assert len(matched[0]) == 4
    assert matched[0].text == "May 15, 1993"