2017-01-10 21:24:10 +03:00
|
|
|
# coding: utf-8
|
2016-11-24 01:48:41 +03:00
|
|
|
from __future__ import unicode_literals
|
2017-01-10 21:24:10 +03:00
|
|
|
|
2017-01-12 18:49:40 +03:00
|
|
|
from ...matcher import Matcher
|
|
|
|
from ...attrs import ORTH
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue615(en_tokenizer):
    """Check that a phrase merged from an ``on_match`` callback becomes an entity.

    A two-token pattern ("golf club") is registered on a ``Matcher`` under the
    label "Sport_Equipment" together with a callback that merges each matched
    span. After running the matcher, the doc must expose at least one entity
    and the first entity's label must not be 0.
    """

    def merge_phrases(matcher, doc, i, matches):
        """Merge every matched span of ``doc``.

        Merging changes the token indices, so nothing is done until the
        callback is invoked for the last entry of ``matches``; at that point
        all spans are collected first and only then merged one by one.
        """
        is_last_match = i == len(matches) - 1
        if not is_last_match:
            return None
        # Resolve every match to a Span object before merging any of them.
        labelled_spans = [
            (match_label, doc[first:last])
            for _, match_label, first, last in matches
        ]
        for match_label, phrase in labelled_spans:
            # Labelled matches get the 'NNP' tag; otherwise keep the root's tag.
            tag = 'NNP' if match_label else phrase.root.tag_
            phrase.merge(tag, phrase.text, doc.vocab.strings[match_label])

    text = "The golf club is broken"
    pattern = [{ORTH: "golf"}, {ORTH: "club"}]
    label = "Sport_Equipment"

    doc = en_tokenizer(text)
    matcher = Matcher(doc.vocab)
    matcher.add_entity(label, on_match=merge_phrases)
    matcher.add_pattern(label, pattern, label=label)

    # Run for the callback's side effect on ``doc``; the returned matches
    # themselves are not inspected.
    matcher(doc)
    found_entities = list(doc.ents)

    assert found_entities != []  # assertion 1
    assert found_entities[0].label != 0  # assertion 2
|