bugfix: introducing multiple roots now updates original head's properties

adjust tests to rely less on statistical model
This commit is contained in:
Wolfgang Seeker 2016-04-20 16:40:36 +02:00
parent c356251f45
commit 12024b0b0a
5 changed files with 109 additions and 41 deletions

View File

@@ -395,12 +395,57 @@ cdef class ArcEager(TransitionSystem):
cdef int finalize_state(self, StateC* st) nogil:
    # Post-process the finished parse state: give any still-unlabelled,
    # unattached token the root label and, when segmenting via root-labelled
    # arcs instead of the Break transition, detach those arcs so that each
    # root word becomes a root of its own — updating the original head's
    # child counts and left/right edges so they stay consistent.
    # NOTE(review): heads appear to be used as absolute token indices here
    # (see `&st._sent[orig_head_id]`), with head == 0 and dep == 0 marking an
    # unattached token — TODO confirm against StateC's head conventions.
    cdef int i
    cdef int orig_head_id
    cdef TokenC* orig_head
    cdef int new_edge
    cdef int child_i
    cdef TokenC* head_i
    for i in range(st.length):
        # Unattached and unlabelled: make it a root.
        if st._sent[i].head == 0 and st._sent[i].dep == 0:
            st._sent[i].dep = self.root_label
        # If we're not using the Break transition, we segment via root-labelled
        # arcs between the root words.
        elif USE_ROOT_ARC_SEGMENT and st._sent[i].dep == self.root_label:
            orig_head_id = st._sent[i].head
            orig_head = &st._sent[orig_head_id]
            if i < orig_head_id: # i is left dependent
                orig_head.l_kids -= 1
                if i == orig_head.l_edge: # i is left-most child
                    # find the second left-most child and make it the new l_edge
                    # NOTE(review): this scan keeps overwriting new_edge, so it
                    # ends on the matching child CLOSEST to the head, not the
                    # second left-most one the comment promises — verify intent.
                    new_edge = orig_head_id
                    child_i = i
                    while child_i < orig_head_id:
                        if st._sent[child_i].head == orig_head_id:
                            new_edge = child_i
                        child_i += 1
                    # then walk up the path to root and update the l_edges of all ancestors
                    # the logic here works because the tree is guaranteed to be projective
                    head_i = &st._sent[orig_head.head]
                    while head_i.l_edge == orig_head.l_edge:
                        head_i.l_edge = new_edge
                        head_i = &st._sent[head_i.head]
                    orig_head.l_edge = new_edge
            elif i > orig_head_id: # i is right dependent
                orig_head.r_kids -= 1
                if i == orig_head.r_edge:
                    # find the second right-most child and make it the new r_edge
                    # NOTE(review): same caveat as the left branch — the scan
                    # ends on the child closest to the head.
                    new_edge = orig_head_id
                    child_i = i
                    while child_i > orig_head_id:
                        if st._sent[child_i].head == orig_head_id:
                            new_edge = child_i
                        child_i -= 1
                    # then walk up the path to root and update the r_edges of all ancestors
                    # the logic here works because the tree is guaranteed to be projective
                    head_i = &st._sent[orig_head.head]
                    while head_i.r_edge == orig_head.r_edge:
                        head_i.r_edge = new_edge
                        head_i = &st._sent[head_i.head]
                    orig_head.r_edge = new_edge
            # note that this can create non-projective trees if there are arcs
            # between nodes on both sides of the new root node
            st._sent[i].head = 0
cdef int set_valid(self, int* output, const StateC* st) nogil:

View File

@@ -6,7 +6,7 @@ import spacy
from spacy.matcher import Matcher
from spacy.attrs import ORTH, LOWER, ENT_IOB, ENT_TYPE
from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63
from spacy.symbols import DATE
from spacy.symbols import DATE, LOC
def test_overlap_issue118(EN):
@@ -134,15 +134,59 @@ def test_overlap_prefix_reorder(EN):
assert ents[0].end == 11
@pytest.mark.models
def test_ner_interaction(EN):
EN.matcher.add('LAX_Airport', 'AIRPORT', {}, [[{ORTH: 'LAX'}]])
EN.matcher.add('SFO_Airport', 'AIRPORT', {}, [[{ORTH: 'SFO'}]])
doc = EN(u'get me a flight from SFO to LAX leaving 20 December and arriving on January 5th')
# @pytest.mark.models
# def test_ner_interaction(EN):
# EN.matcher.add('LAX_Airport', 'AIRPORT', {}, [[{ORTH: 'LAX'}]])
# EN.matcher.add('SFO_Airport', 'AIRPORT', {}, [[{ORTH: 'SFO'}]])
# doc = EN(u'get me a flight from SFO to LAX leaving 20 December and arriving on January 5th')
ents = [(ent.label_, ent.text) for ent in doc.ents]
assert ents[0] == ('AIRPORT', 'SFO')
assert ents[1] == ('AIRPORT', 'LAX')
assert ents[2] == ('DATE', '20 December')
assert ents[3] == ('DATE', 'January 5th')
# ents = [(ent.label_, ent.text) for ent in doc.ents]
# assert ents[0] == ('AIRPORT', 'SFO')
# assert ents[1] == ('AIRPORT', 'LAX')
# assert ents[2] == ('DATE', '20 December')
# assert ents[3] == ('DATE', 'January 5th')
# @pytest.mark.models
# def test_ner_interaction(EN):
# # ensure that matcher doesn't overwrite annotations set by the NER model
# doc = EN.tokenizer.tokens_from_list(u'get me a flight from SFO to LAX leaving 20 December and arriving on January 5th'.split(' '))
# EN.tagger(doc)
# columns = [ENT_IOB, ENT_TYPE]
# values = numpy.ndarray(shape=(len(doc),len(columns)), dtype='int32')
# # IOB values are 0=missing, 1=I, 2=O, 3=B
# iobs = [2,2,2,2,2,3,2,3,2,3,1,2,2,2,3,1]
# types = [0,0,0,0,0,LOC,0,LOC,0,DATE,DATE,0,0,0,DATE,DATE]
# values[:] = zip(iobs,types)
# doc.from_array(columns,values)
# assert doc[5].ent_type_ == 'LOC'
# assert doc[7].ent_type_ == 'LOC'
# assert doc[9].ent_type_ == 'DATE'
# assert doc[10].ent_type_ == 'DATE'
# assert doc[14].ent_type_ == 'DATE'
# assert doc[15].ent_type_ == 'DATE'
# EN.matcher.add('LAX_Airport', 'AIRPORT', {}, [[{ORTH: 'LAX'}]])
# EN.matcher.add('SFO_Airport', 'AIRPORT', {}, [[{ORTH: 'SFO'}]])
# EN.matcher(doc)
# assert doc[5].ent_type_ != 'AIRPORT'
# assert doc[7].ent_type_ != 'AIRPORT'
# assert doc[5].ent_type_ == 'LOC'
# assert doc[7].ent_type_ == 'LOC'
# assert doc[9].ent_type_ == 'DATE'
# assert doc[10].ent_type_ == 'DATE'
# assert doc[14].ent_type_ == 'DATE'
# assert doc[15].ent_type_ == 'DATE'

View File

@@ -57,8 +57,7 @@ def test_child_consistency(EN, sun_text):
@pytest.mark.models
def test_edges(EN):
sun_text = u"Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium."
def test_edges(EN, sun_text):
tokens = EN(sun_text)
for token in tokens:
subtree = list(token.subtree)

View File

@@ -3,30 +3,10 @@ from __future__ import unicode_literals
from spacy.en import English
import pytest
@pytest.fixture
def tagged(EN):
    """A tagged (but unparsed) Doc for the sentence 'Bananas in pyjamas are geese.'."""
    text = u'Bananas in pyjamas are geese.'
    doc = EN(text, tag=True, parse=False)
    return doc
@pytest.fixture
def lemmas(tagged):
    """Lemma string for every token of the tagged fixture, in order."""
    return [token.lemma_ for token in tagged]
@pytest.mark.models
def test_lemmas(lemmas, tagged):
    """The first four tokens must lemmatise to known-good values."""
    for position, expected in enumerate(['banana', 'in', 'pyjama', 'be']):
        assert lemmas[position] == expected
    # 'geese' -> 'goose' only holds when the tagger assigned it the same
    # tag as 'pyjamas' (i.e. a plural noun), so gate on that.
    if tagged[2].tag == tagged[4].tag:
        assert lemmas[4] == 'goose'
def test_didnt(EN):
    """The contracted token in \"didn't\" must still receive a non-empty lemma."""
    doc = EN(u"I didn't do it")
    assert doc[1].lemma_ != u""
def test_lemma_assignment(EN):
    """Lemmas are empty before tagging and populated for every token afterwards."""
    words = u'Bananas in pyjamas are geese .'.split(' ')
    doc = EN.tokenizer.tokens_from_list(words)
    # No tagger has run yet, so no token should carry a lemma.
    assert all(token.lemma_ == u'' for token in doc)
    EN.tagger(doc)
    # Tagging triggers lemmatisation for the whole document.
    assert all(token.lemma_ != u'' for token in doc)

View File

@@ -190,5 +190,5 @@ def test_right_edge(EN):
token = doc[6]
assert token.text == u'for'
subtree = [w.text for w in token.subtree]
assert subtree == [u'for' , u'the', u'sake', u'of']
assert token.right_edge.text == u'of'
assert subtree == [u'for' , u'the', u'sake', u'of', u'such', u'as', u'live', u'under', u'the', u'government', u'of', u'the', u'Romans', u',']
assert token.right_edge.text == u','