2015-11-08 18:19:33 +03:00
from __future__ import unicode_literals , print_function
2015-07-23 02:19:11 +03:00
import pytest
2015-04-13 22:33:54 +03:00
2015-11-11 20:56:07 +03:00
from spacy . attrs import LOWER
from spacy . matcher import Matcher
2017-05-29 23:14:31 +03:00
@pytest.mark.models('en')
def test_en_ner_simple_types(EN):
    '''Check that the English model finds a PERSON and a GPE in a simple sentence.

    EN is the English pipeline fixture; it is called on the raw text and the
    resulting document's entities are inspected.
    '''
    tokens = EN(u'Mr. Best flew to New York on Saturday morning.')
    ents = list(tokens.ents)
    # First entity: a one-token PERSON span starting at index 1
    # (presumably "Best" -- depends on the tokenizer output).
    assert ents[0].start == 1
    assert ents[0].end == 2
    assert ents[0].label_ == 'PERSON'
    # Second entity: a two-token GPE span covering indices 4-5
    # (presumably "New York").
    assert ents[1].start == 4
    assert ents[1].end == 6
    assert ents[1].label_ == 'GPE'
2015-11-08 15:57:15 +03:00
2017-06-05 03:09:27 +03:00
@pytest.mark.skip
@pytest.mark.models('en')
def test_en_ner_consistency_bug(EN):
    '''Test an arbitrary sequence-consistency bug encountered during speed test

    The test makes no assertions: it passes as long as none of the calls
    below raises.
    '''
    # The result of this first call is overwritten below; the call is kept
    # because it is part of the original reproduction sequence.
    tokens = EN(u'Where rap essentially went mainstream, illustrated by seminal Public Enemy, Beastie Boys and L.L. Cool J. tracks.')
    # Parse the second text with the 'ner' component disabled, so entities
    # come only from the matcher before the entity recognizer is run by hand.
    tokens = EN(u'''Charity and other short-term aid have buoyed them so far, and a tax-relief bill working its way through Congress would help. But the September 11 Victim Compensation Fund, enacted by Congress to discourage people from filing lawsuits, will determine the shape of their lives for years to come.\n\n''', disable=['ner'])
    tokens.ents += tuple(EN.matcher(tokens))
    # Run the entity recognizer on a document that already has entities set.
    EN.entity(tokens)
2015-11-11 20:56:07 +03:00
2017-06-05 00:35:06 +03:00
@pytest.mark.skip
@pytest.mark.models('en')
def test_en_ner_unit_end_gazetteer(EN):
    '''Test a bug in the interaction between the NER model and the gazetteer

    A Matcher with two patterns under the 'MemberNames' key ("cal" and
    "cal henderson", matched on the LOWER attribute) plays the gazetteer.
    '''
    matcher = Matcher(EN.vocab)
    matcher.add(
        'MemberNames',
        None,
        [{LOWER: 'cal'}],
        [{LOWER: 'cal'}, {LOWER: 'henderson'}],
    )
    doc = EN(u'who is cal the manager of?')
    # Only fall back to the gazetteer when the model found no entities itself.
    if not list(doc.ents):
        ents = matcher(doc)
        assert len(ents) == 1
        doc.ents += tuple(ents)
        # Re-run the entity recognizer on top of the gazetteer match.
        EN.entity(doc)
    assert list(doc.ents)[0].text == 'cal'