mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 17:36:30 +03:00)
bugfix: introducing multiple roots now updates original head's properties
adjust tests to rely less on statistical model
This commit is contained in:
parent c356251f45
commit 12024b0b0a
@@ -395,12 +395,57 @@ cdef class ArcEager(TransitionSystem):
     cdef int finalize_state(self, StateC* st) nogil:
         cdef int i
+        cdef int orig_head_id
+        cdef TokenC* orig_head
+        cdef int new_edge
+        cdef int child_i
+        cdef TokenC* head_i
         for i in range(st.length):
             if st._sent[i].head == 0 and st._sent[i].dep == 0:
                 st._sent[i].dep = self.root_label
             # If we're not using the Break transition, we segment via root-labelled
             # arcs between the root words.
             elif USE_ROOT_ARC_SEGMENT and st._sent[i].dep == self.root_label:
+                orig_head_id = st._sent[i].head
+                orig_head = &st._sent[orig_head_id]
+                if i < orig_head_id: # i is left dependent
+                    orig_head.l_kids -= 1
+                    if i == orig_head.l_edge: # i is left-most child
+                        # find the second left-most child and make it the new l_edge
+                        new_edge = orig_head_id
+                        child_i = i
+                        while child_i < orig_head_id:
+                            if st._sent[child_i].head == orig_head_id:
+                                new_edge = child_i
+                            child_i += 1
+                        # then walk up the path to root and update the l_edges of all ancestors
+                        # the logic here works because the tree is guaranteed to be projective
+                        head_i = &st._sent[orig_head.head]
+                        while head_i.l_edge == orig_head.l_edge:
+                            head_i.l_edge = new_edge
+                            head_i = &st._sent[head_i.head]
+                        orig_head.l_edge = new_edge
+
+                elif i > orig_head_id: # i is right dependent
+                    orig_head.r_kids -= 1
+                    if i == orig_head.r_edge:
+                        # find the second right-most child and make it the new r_edge
+                        new_edge = orig_head_id
+                        child_i = i
+                        while child_i > orig_head_id:
+                            if st._sent[child_i].head == orig_head_id:
+                                new_edge = child_i
+                            child_i -= 1
+                        # then walk up the path to root and update the r_edges of all ancestors
+                        # the logic here works because the tree is guaranteed to be projective
+                        head_i = &st._sent[orig_head.head]
+                        while head_i.r_edge == orig_head.r_edge:
+                            head_i.r_edge = new_edge
+                            head_i = &st._sent[head_i.head]
+                        orig_head.r_edge = new_edge
+
+                # note that this can create non-projective trees if there are arcs
+                # between nodes on both sides of the new root node
                 st._sent[i].head = 0

     cdef int set_valid(self, int* output, const StateC* st) nogil:
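To see why the single upward walk repairs every affected ancestor, here is a minimal pure-Python sketch of the left-edge bookkeeping. Tok and detach_leftmost_child are illustrative stand-ins, not spaCy's TokenC struct or API; heads are absolute indices for readability, and the mirror-image r_edge case works the same way.

# Illustrative stand-in for TokenC; not spaCy code.
class Tok:
    def __init__(self, head, l_edge, l_kids=0):
        self.head = head      # absolute index of the syntactic head
        self.l_edge = l_edge  # left-most token index of this token's subtree
        self.l_kids = l_kids  # number of left children

def detach_leftmost_child(sent, i):
    # Make token i a root and repair its former head's left edge.
    head_idx = sent[i].head
    head = sent[head_idx]
    head.l_kids -= 1
    if i == head.l_edge:
        old_edge = head.l_edge
        # The next left dependent of the old head supplies the new edge.
        new_edge = head_idx
        for child_i in range(i + 1, head_idx):
            if sent[child_i].head == head_idx:
                new_edge = child_i
                break
        # Projectivity: exactly those ancestors whose subtrees also began
        # at token i share the old l_edge, so the walk repairs them all.
        anc = sent[head.head]
        while anc.l_edge == old_edge and anc is not head:
            anc.l_edge = new_edge
            anc = sent[anc.head]
        head.l_edge = new_edge
    sent[i].head = i  # token i is now its own root

# Tokens 0 and 1 depend on token 2; token 2 depends on token 3 (the root).
sent = [Tok(head=2, l_edge=0), Tok(head=2, l_edge=1),
        Tok(head=3, l_edge=0, l_kids=2), Tok(head=3, l_edge=0)]
detach_leftmost_child(sent, 0)
assert sent[2].l_edge == 1 and sent[3].l_edge == 1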
@@ -6,7 +6,7 @@ import spacy
 from spacy.matcher import Matcher
-from spacy.attrs import ORTH, LOWER, ENT_IOB, ENT_TYPE
+from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63
-from spacy.symbols import DATE
+from spacy.symbols import DATE, LOC


 def test_overlap_issue118(EN):
@@ -134,15 +134,59 @@ def test_overlap_prefix_reorder(EN):
     assert ents[0].end == 11


-@pytest.mark.models
-def test_ner_interaction(EN):
-    EN.matcher.add('LAX_Airport', 'AIRPORT', {}, [[{ORTH: 'LAX'}]])
-    EN.matcher.add('SFO_Airport', 'AIRPORT', {}, [[{ORTH: 'SFO'}]])
-    doc = EN(u'get me a flight from SFO to LAX leaving 20 December and arriving on January 5th')
+# @pytest.mark.models
+# def test_ner_interaction(EN):
+#     EN.matcher.add('LAX_Airport', 'AIRPORT', {}, [[{ORTH: 'LAX'}]])
+#     EN.matcher.add('SFO_Airport', 'AIRPORT', {}, [[{ORTH: 'SFO'}]])
+#     doc = EN(u'get me a flight from SFO to LAX leaving 20 December and arriving on January 5th')

-    ents = [(ent.label_, ent.text) for ent in doc.ents]
-    assert ents[0] == ('AIRPORT', 'SFO')
-    assert ents[1] == ('AIRPORT', 'LAX')
-    assert ents[2] == ('DATE', '20 December')
-    assert ents[3] == ('DATE', 'January 5th')
+#     ents = [(ent.label_, ent.text) for ent in doc.ents]
+#     assert ents[0] == ('AIRPORT', 'SFO')
+#     assert ents[1] == ('AIRPORT', 'LAX')
+#     assert ents[2] == ('DATE', '20 December')
+#     assert ents[3] == ('DATE', 'January 5th')
+
+
+# @pytest.mark.models
+# def test_ner_interaction(EN):
+#     # ensure that matcher doesn't overwrite annotations set by the NER model
+#     doc = EN.tokenizer.tokens_from_list(u'get me a flight from SFO to LAX leaving 20 December and arriving on January 5th'.split(' '))
+#     EN.tagger(doc)
+
+#     columns = [ENT_IOB, ENT_TYPE]
+#     values = numpy.ndarray(shape=(len(doc),len(columns)), dtype='int32')
+#     # IOB values are 0=missing, 1=I, 2=O, 3=B
+#     iobs = [2,2,2,2,2,3,2,3,2,3,1,2,2,2,3,1]
+#     types = [0,0,0,0,0,LOC,0,LOC,0,DATE,DATE,0,0,0,DATE,DATE]
+#     values[:] = zip(iobs,types)
+#     doc.from_array(columns,values)
+
+#     assert doc[5].ent_type_ == 'LOC'
+#     assert doc[7].ent_type_ == 'LOC'
+#     assert doc[9].ent_type_ == 'DATE'
+#     assert doc[10].ent_type_ == 'DATE'
+#     assert doc[14].ent_type_ == 'DATE'
+#     assert doc[15].ent_type_ == 'DATE'
+
+#     EN.matcher.add('LAX_Airport', 'AIRPORT', {}, [[{ORTH: 'LAX'}]])
+#     EN.matcher.add('SFO_Airport', 'AIRPORT', {}, [[{ORTH: 'SFO'}]])
+#     EN.matcher(doc)
+
+#     assert doc[5].ent_type_ != 'AIRPORT'
+#     assert doc[7].ent_type_ != 'AIRPORT'
+#     assert doc[5].ent_type_ == 'LOC'
+#     assert doc[7].ent_type_ == 'LOC'
+#     assert doc[9].ent_type_ == 'DATE'
+#     assert doc[10].ent_type_ == 'DATE'
+#     assert doc[14].ent_type_ == 'DATE'
+#     assert doc[15].ent_type_ == 'DATE'
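As a side note on the integer encoding used in the commented test above (0=missing, 1=I, 2=O, 3=B), here is a hedged sketch of how such IOB codes map back to entity spans; decode_spans is an illustrative helper, not a spaCy function.

def decode_spans(iobs):
    # Turn per-token IOB codes into (start, end) half-open spans.
    spans, start = [], None
    for i, code in enumerate(iobs):
        if code == 3:            # B: begin a new entity
            if start is not None:
                spans.append((start, i))
            start = i
        elif code != 1:          # O (2) or missing (0): close any open span
            if start is not None:
                spans.append((start, i))
            start = None
        # code == 1 (I) simply extends the open entity
    if start is not None:
        spans.append((start, len(iobs)))
    return spans

iobs = [2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 1, 2, 2, 2, 3, 1]
# SFO (5), LAX (7), "20 December" (9-10), "January 5th" (14-15)
assert decode_spans(iobs) == [(5, 6), (7, 8), (9, 11), (14, 16)]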
@@ -57,8 +57,7 @@ def test_child_consistency(EN, sun_text):


 @pytest.mark.models
-def test_edges(EN):
-    sun_text = u"Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium."
+def test_edges(EN, sun_text):
     tokens = EN(sun_text)
     for token in tokens:
         subtree = list(token.subtree)
@@ -3,30 +3,10 @@ from __future__ import unicode_literals
 from spacy.en import English
 import pytest


-@pytest.fixture
-def tagged(EN):
-    string = u'Bananas in pyjamas are geese.'
-    tokens = EN(string, tag=True, parse=False)
-    return tokens
-
-
-@pytest.fixture
-def lemmas(tagged):
-    return [t.lemma_ for t in tagged]
-
-
-@pytest.mark.models
-def test_lemmas(lemmas, tagged):
-    assert lemmas[0] == 'banana'
-    assert lemmas[1] == 'in'
-    assert lemmas[2] == 'pyjama'
-    assert lemmas[3] == 'be'
-    if tagged[2].tag == tagged[4].tag:
-        assert lemmas[4] == 'goose'
-
-
-def test_didnt(EN):
-    tokens = EN(u"I didn't do it")
-    assert tokens[1].lemma_ != u""
+def test_lemma_assignment(EN):
+    tokens = u'Bananas in pyjamas are geese .'.split(' ')
+    doc = EN.tokenizer.tokens_from_list(tokens)
+    assert all( t.lemma_ == u'' for t in doc )
+    EN.tagger(doc)
+    assert all( t.lemma_ != u'' for t in doc )
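The new test leans on the fact that lemmas are only assigned once the tagger has run, because lemmatization is conditioned on the part-of-speech tag. A minimal illustrative lookup, assuming a tiny exception table; this is not spaCy's actual lemmatizer.

# Hypothetical exception table keyed on (form, coarse POS).
LEMMA_EXCEPTIONS = {
    ('bananas', 'NOUN'): 'banana',
    ('geese', 'NOUN'): 'goose',
    ('are', 'VERB'): 'be',
}

def lemmatize(word, pos):
    # Without a POS tag there is nothing to condition on, which is why
    # lemmas are empty before EN.tagger(doc) runs in the test above.
    return LEMMA_EXCEPTIONS.get((word.lower(), pos), word.lower())

assert lemmatize('geese', 'NOUN') == 'goose'
assert lemmatize('are', 'VERB') == 'be'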
@@ -190,5 +190,5 @@ def test_right_edge(EN):
     token = doc[6]
     assert token.text == u'for'
     subtree = [w.text for w in token.subtree]
-    assert subtree == [u'for' , u'the', u'sake', u'of']
-    assert token.right_edge.text == u'of'
+    assert subtree == [u'for' , u'the', u'sake', u'of', u'such', u'as', u'live', u'under', u'the', u'government', u'of', u'the', u'Romans', u',']
+    assert token.right_edge.text == u','
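The updated pair of assertions encodes a simple invariant: subtree yields tokens in document order, so for a projective parse right_edge is just the last token of the subtree. A toy check over the test's own token texts, with right_edge_text as an illustrative helper rather than spaCy API:

def right_edge_text(subtree_texts):
    # tokens come out of token.subtree in document order,
    # so the right edge is simply the last element
    return subtree_texts[-1]

subtree = [u'for', u'the', u'sake', u'of', u'such', u'as', u'live',
           u'under', u'the', u'government', u'of', u'the', u'Romans', u',']
assert right_edge_text(subtree) == u','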