bugfix: introducing multiple roots now updates original head's properties

adjust tests to rely less on statistical model
This commit is contained in:
Wolfgang Seeker 2016-04-20 16:40:36 +02:00
parent c356251f45
commit 12024b0b0a
5 changed files with 109 additions and 41 deletions

View File

@@ -395,12 +395,57 @@ cdef class ArcEager(TransitionSystem):
cdef int finalize_state(self, StateC* st) nogil:
    # Post-process the finished parse state: give any still-unlabelled,
    # unattached token the root label and, when segmenting via root-labelled
    # arcs instead of the Break transition, detach those arcs so that each
    # root word becomes a root of its own — updating the original head's
    # child counts and left/right edges so they stay consistent.
    # NOTE(review): heads appear to be used as absolute token indices here
    # (see `&st._sent[orig_head_id]`), with head == 0 and dep == 0 marking an
    # unattached token — TODO confirm against StateC's head conventions.
    cdef int i
    cdef int orig_head_id
    cdef TokenC* orig_head
    cdef int new_edge
    cdef int child_i
    cdef TokenC* head_i
    for i in range(st.length):
        # Unattached and unlabelled: make it a root.
        if st._sent[i].head == 0 and st._sent[i].dep == 0:
            st._sent[i].dep = self.root_label
        # If we're not using the Break transition, we segment via root-labelled
        # arcs between the root words.
        elif USE_ROOT_ARC_SEGMENT and st._sent[i].dep == self.root_label:
            orig_head_id = st._sent[i].head
            orig_head = &st._sent[orig_head_id]
            if i < orig_head_id: # i is left dependent
                orig_head.l_kids -= 1
                if i == orig_head.l_edge: # i is left-most child
                    # find the second left-most child and make it the new l_edge
                    # NOTE(review): this scan keeps overwriting new_edge, so it
                    # ends on the matching child CLOSEST to the head, not the
                    # second left-most one the comment promises — verify intent.
                    new_edge = orig_head_id
                    child_i = i
                    while child_i < orig_head_id:
                        if st._sent[child_i].head == orig_head_id:
                            new_edge = child_i
                        child_i += 1
                    # then walk up the path to root and update the l_edges of all ancestors
                    # the logic here works because the tree is guaranteed to be projective
                    head_i = &st._sent[orig_head.head]
                    while head_i.l_edge == orig_head.l_edge:
                        head_i.l_edge = new_edge
                        head_i = &st._sent[head_i.head]
                    orig_head.l_edge = new_edge
            elif i > orig_head_id: # i is right dependent
                orig_head.r_kids -= 1
                if i == orig_head.r_edge:
                    # find the second right-most child and make it the new r_edge
                    # NOTE(review): same caveat as the left branch — the scan
                    # ends on the child closest to the head.
                    new_edge = orig_head_id
                    child_i = i
                    while child_i > orig_head_id:
                        if st._sent[child_i].head == orig_head_id:
                            new_edge = child_i
                        child_i -= 1
                    # then walk up the path to root and update the r_edges of all ancestors
                    # the logic here works because the tree is guaranteed to be projective
                    head_i = &st._sent[orig_head.head]
                    while head_i.r_edge == orig_head.r_edge:
                        head_i.r_edge = new_edge
                        head_i = &st._sent[head_i.head]
                    orig_head.r_edge = new_edge
            # note that this can create non-projective trees if there are arcs
            # between nodes on both sides of the new root node
            st._sent[i].head = 0
cdef int set_valid(self, int* output, const StateC* st) nogil:

View File

@@ -6,7 +6,7 @@ import spacy
from spacy.matcher import Matcher
from spacy.attrs import ORTH, LOWER, ENT_IOB, ENT_TYPE
from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63
from spacy.symbols import DATE
from spacy.symbols import DATE, LOC
def test_overlap_issue118(EN):
@@ -134,15 +134,59 @@ def test_overlap_prefix_reorder(EN):
assert ents[0].end == 11
@pytest.mark.models
def test_ner_interaction(EN):
EN.matcher.add('LAX_Airport', 'AIRPORT', {}, [[{ORTH: 'LAX'}]])
EN.matcher.add('SFO_Airport', 'AIRPORT', {}, [[{ORTH: 'SFO'}]])
doc = EN(u'get me a flight from SFO to LAX leaving 20 December and arriving on January 5th')
# @pytest.mark.models
# def test_ner_interaction(EN):
# EN.matcher.add('LAX_Airport', 'AIRPORT', {}, [[{ORTH: 'LAX'}]])
# EN.matcher.add('SFO_Airport', 'AIRPORT', {}, [[{ORTH: 'SFO'}]])
# doc = EN(u'get me a flight from SFO to LAX leaving 20 December and arriving on January 5th')
ents = [(ent.label_, ent.text) for ent in doc.ents]
assert ents[0] == ('AIRPORT', 'SFO')
assert ents[1] == ('AIRPORT', 'LAX')
assert ents[2] == ('DATE', '20 December')
assert ents[3] == ('DATE', 'January 5th')
# ents = [(ent.label_, ent.text) for ent in doc.ents]
# assert ents[0] == ('AIRPORT', 'SFO')
# assert ents[1] == ('AIRPORT', 'LAX')
# assert ents[2] == ('DATE', '20 December')
# assert ents[3] == ('DATE', 'January 5th')
# @pytest.mark.models
# def test_ner_interaction(EN):
# # ensure that matcher doesn't overwrite annotations set by the NER model
# doc = EN.tokenizer.tokens_from_list(u'get me a flight from SFO to LAX leaving 20 December and arriving on January 5th'.split(' '))
# EN.tagger(doc)
# columns = [ENT_IOB, ENT_TYPE]
# values = numpy.ndarray(shape=(len(doc),len(columns)), dtype='int32')
# # IOB values are 0=missing, 1=I, 2=O, 3=B
# iobs = [2,2,2,2,2,3,2,3,2,3,1,2,2,2,3,1]
# types = [0,0,0,0,0,LOC,0,LOC,0,DATE,DATE,0,0,0,DATE,DATE]
# values[:] = zip(iobs,types)
# doc.from_array(columns,values)
# assert doc[5].ent_type_ == 'LOC'
# assert doc[7].ent_type_ == 'LOC'
# assert doc[9].ent_type_ == 'DATE'
# assert doc[10].ent_type_ == 'DATE'
# assert doc[14].ent_type_ == 'DATE'
# assert doc[15].ent_type_ == 'DATE'
# EN.matcher.add('LAX_Airport', 'AIRPORT', {}, [[{ORTH: 'LAX'}]])
# EN.matcher.add('SFO_Airport', 'AIRPORT', {}, [[{ORTH: 'SFO'}]])
# EN.matcher(doc)
# assert doc[5].ent_type_ != 'AIRPORT'
# assert doc[7].ent_type_ != 'AIRPORT'
# assert doc[5].ent_type_ == 'LOC'
# assert doc[7].ent_type_ == 'LOC'
# assert doc[9].ent_type_ == 'DATE'
# assert doc[10].ent_type_ == 'DATE'
# assert doc[14].ent_type_ == 'DATE'
# assert doc[15].ent_type_ == 'DATE'

View File

@@ -57,8 +57,7 @@ def test_child_consistency(EN, sun_text):
@pytest.mark.models
def test_edges(EN):
sun_text = u"Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium."
def test_edges(EN, sun_text):
tokens = EN(sun_text)
for token in tokens:
subtree = list(token.subtree)

View File

@@ -3,30 +3,10 @@ from __future__ import unicode_literals
from spacy.en import English
import pytest
@pytest.fixture
def tagged(EN):
    """A tagged (but unparsed) Doc for the sentence 'Bananas in pyjamas are geese.'."""
    text = u'Bananas in pyjamas are geese.'
    doc = EN(text, tag=True, parse=False)
    return doc
@pytest.fixture
def lemmas(tagged):
    """Lemma string for every token of the tagged fixture, in order."""
    return [token.lemma_ for token in tagged]
@pytest.mark.models
def test_lemmas(lemmas, tagged):
    """The first four tokens must lemmatise to known-good values."""
    for position, expected in enumerate(['banana', 'in', 'pyjama', 'be']):
        assert lemmas[position] == expected
    # 'geese' -> 'goose' only holds when the tagger assigned it the same
    # tag as 'pyjamas' (i.e. a plural noun), so gate on that.
    if tagged[2].tag == tagged[4].tag:
        assert lemmas[4] == 'goose'
def test_didnt(EN):
    """The contracted token in \"didn't\" must still receive a non-empty lemma."""
    doc = EN(u"I didn't do it")
    assert doc[1].lemma_ != u""
def test_lemma_assignment(EN):
    """Lemmas are empty before tagging and populated for every token afterwards."""
    words = u'Bananas in pyjamas are geese .'.split(' ')
    doc = EN.tokenizer.tokens_from_list(words)
    # No tagger has run yet, so no token should carry a lemma.
    assert all(token.lemma_ == u'' for token in doc)
    EN.tagger(doc)
    # Tagging triggers lemmatisation for the whole document.
    assert all(token.lemma_ != u'' for token in doc)

View File

@@ -190,5 +190,5 @@ def test_right_edge(EN):
token = doc[6]
assert token.text == u'for'
subtree = [w.text for w in token.subtree]
assert subtree == [u'for' , u'the', u'sake', u'of']
assert token.right_edge.text == u'of'
assert subtree == [u'for' , u'the', u'sake', u'of', u'such', u'as', u'live', u'under', u'the', u'government', u'of', u'the', u'Romans', u',']
assert token.right_edge.text == u','