reformulate noun chunk tests for English

Wolfgang Seeker 2016-05-03 14:24:35 +02:00
parent 1786331cd8
commit 7b246c13cb
7 changed files with 111 additions and 21 deletions

View File

@@ -382,6 +382,7 @@ cpdef enum symbol_t:
     cc
     ccomp
     complm
+    compound
     conj
     csubj
     csubjpass

View File

@@ -381,6 +381,7 @@ IDS = {
     "cc": cc,
     "ccomp": ccomp,
     "complm": complm,
+    "compound": compound,
     "conj": conj,
     "csubj": csubj,
     "csubjpass": csubjpass,

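A quick sketch of what the two symbol-table changes buy (not part of the diff): once compound is registered in the enum and in the IDS mapping above, the name becomes importable from spacy.symbols as an integer constant, which is how the new test file below passes it into its dependency arrays. This assumes the module-level IDS dict is importable from Python; concrete integer values vary between builds.

# Sketch, assuming a build that contains this commit.
from spacy.symbols import compound   # enum value declared in the .pxd hunk above
from spacy.symbols import IDS        # module-level name -> ID mapping from the .pyx hunk (assumed importable)

# the string name and the constant resolve to the same integer ID
assert IDS["compound"] == compound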
View File

@@ -225,6 +225,11 @@ cdef class Parser:
     def step_through(self, Doc doc):
         return StepwiseState(self, doc)
 
+    def from_transition_sequence(self, Doc doc, sequence):
+        with self.step_through(doc) as stepwise:
+            for transition in sequence:
+                stepwise.transition(transition)
+
     def add_label(self, label):
         for action in self.moves.action_types:
             self.moves.add_action(action, label)
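Because the diff only shows the helper itself, here is a hedged usage sketch: from_transition_sequence replays a list of transition names through the step_through context manager, so a test can impose a specific parse instead of trusting the statistical model. The transition names below ('S' for shift, 'L-'/'R-' plus a label for left/right arcs, 'D' for reduce) follow the arc-eager naming, and the particular sequence is an assumption, not taken from the commit; it may need adjusting for the loaded model's move set.

# hypothetical usage sketch, not part of the commit
import spacy

nlp = spacy.load('en')                                         # requires the English model data
doc = nlp.tokenizer.tokens_from_list(u'Dogs sleep .'.split(' '))
nlp.tagger.tag_from_strings(doc, 'NNS VBP .'.split(' '))

# make sure every label used below exists in the parser's move set
for label in ('nsubj', 'punct'):
    nlp.parser.add_label(label)

# shift 'Dogs', attach it as nsubj of 'sleep', shift 'sleep',
# attach '.' as punct of 'sleep', then reduce '.' off the stack
nlp.parser.from_transition_sequence(doc, ['S', 'L-nsubj', 'S', 'R-punct', 'D'])

print([(w.orth_, w.dep_, w.head.orth_) for w in doc])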

View File

@@ -7,7 +7,7 @@ import spacy
 def EN():
     return spacy.load("en")


-@pytest.fixture(score="session")
+@pytest.fixture(scope="session")
 def DE():
     return spacy.load("de")

View File

@@ -2,30 +2,30 @@ from __future__ import unicode_literals
 
 import pytest
 
-@pytest.mark.models
-def test_nsubj(EN):
-    sent = EN(u'A base phrase should be recognized.')
-    base_nps = list(sent.noun_chunks)
-    assert len(base_nps) == 1
-    assert base_nps[0].string == 'A base phrase '
+# @pytest.mark.models
+# def test_nsubj(EN):
+#     sent = EN(u'A base phrase should be recognized.')
+#     base_nps = list(sent.noun_chunks)
+#     assert len(base_nps) == 1
+#     assert base_nps[0].string == 'A base phrase '
 
-@pytest.mark.models
-def test_coord(EN):
-    sent = EN(u'A base phrase and a good phrase are often the same.')
-    base_nps = list(sent.noun_chunks)
-    assert len(base_nps) == 2
-    assert base_nps[0].string == 'A base phrase '
-    assert base_nps[1].string == 'a good phrase '
+# @pytest.mark.models
+# def test_coord(EN):
+#     sent = EN(u'A base phrase and a good phrase are often the same.')
+#     base_nps = list(sent.noun_chunks)
+#     assert len(base_nps) == 2
+#     assert base_nps[0].string == 'A base phrase '
+#     assert base_nps[1].string == 'a good phrase '
 
-@pytest.mark.models
-def test_pp(EN):
-    sent = EN(u'A phrase with another phrase occurs')
-    base_nps = list(sent.noun_chunks)
-    assert len(base_nps) == 2
-    assert base_nps[0].string == 'A phrase '
-    assert base_nps[1].string == 'another phrase '
+# @pytest.mark.models
+# def test_pp(EN):
+#     sent = EN(u'A phrase with another phrase occurs')
+#     base_nps = list(sent.noun_chunks)
+#     assert len(base_nps) == 2
+#     assert base_nps[0].string == 'A phrase '
+#     assert base_nps[1].string == 'another phrase '
 
 @pytest.mark.models

View File

View File

@@ -0,0 +1,83 @@
+from __future__ import unicode_literals
+
+import pytest
+import numpy
+
+from spacy.attrs import HEAD, DEP
+from spacy.symbols import root, det, compound, nsubjpass, aux, auxpass, punct, nsubj, cc, amod, conj, advmod, attr, prep, pobj
+
+
+@pytest.mark.models
+class TestNounChunks:
+    @pytest.fixture(scope="class")
+    def ex1_en(self, EN):
+        example = EN.tokenizer.tokens_from_list('A base phrase should be recognized .'.split(' '))
+        EN.tagger.tag_from_strings(example, 'DT NN NN MD VB VBN .'.split(' '))
+        example.from_array([HEAD, DEP],
+            numpy.asarray(
+                [
+                    [2, det],
+                    [1, compound],
+                    [3, nsubjpass],
+                    [2, aux],
+                    [1, auxpass],
+                    [0, root],
+                    [-1, punct]
+                ], dtype='int32'))
+        return example
+
+    @pytest.fixture(scope="class")
+    def ex2_en(self, EN):
+        example = EN.tokenizer.tokens_from_list('A base phrase and a good phrase are often the same .'.split(' '))
+        EN.tagger.tag_from_strings(example, 'DT NN NN CC DT JJ NN VBP RB DT JJ .'.split(' '))
+        example.from_array([HEAD, DEP],
+            numpy.asarray(
+                [
+                    [2, det],
+                    [1, compound],
+                    [5, nsubj],
+                    [-1, cc],
+                    [1, det],
+                    [1, amod],
+                    [-4, conj],
+                    [0, root],
+                    [-1, advmod],
+                    [1, det],
+                    [-3, attr],
+                    [-4, punct]
+                ], dtype='int32'))
+        return example
+
+    @pytest.fixture(scope="class")
+    def ex3_en(self, EN):
+        example = EN.tokenizer.tokens_from_list('A phrase with another phrase occurs .'.split(' '))
+        EN.tagger.tag_from_strings(example, 'DT NN IN DT NN VBZ .'.split(' '))
+        example.from_array([HEAD, DEP],
+            numpy.asarray(
+                [
+                    [1, det],
+                    [4, nsubj],
+                    [-1, prep],
+                    [1, det],
+                    [-2, pobj],
+                    [0, root],
+                    [-1, punct]
+                ], dtype='int32'))
+        return example
+
+    def test_standard_chunk(self, ex1_en):
+        chunks = list(ex1_en.noun_chunks)
+        assert len(chunks) == 1
+        assert chunks[0].string == 'A base phrase '
+
+    def test_coordinated_chunks(self, ex2_en):
+        chunks = list(ex2_en.noun_chunks)
+        assert len(chunks) == 2
+        assert chunks[0].string == 'A base phrase '
+        assert chunks[1].string == 'a good phrase '
+
+    def test_pp_chunks(self, ex3_en):
+        chunks = list(ex3_en.noun_chunks)
+        assert len(chunks) == 2
+        assert chunks[0].string == 'A phrase '
+        assert chunks[1].string == 'another phrase '
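For readers of the fixtures above, a short decoding sketch (not part of the commit): in from_array([HEAD, DEP], ...) the HEAD column stores each token's head as an offset relative to the token's own index, and the DEP column stores the integer symbol imported from spacy.symbols, so [2, det] on the first row attaches 'A' to the token two positions to its right. Decoding ex1_en by hand:

from spacy.symbols import det, compound, nsubjpass, aux, auxpass, root, punct

words = 'A base phrase should be recognized .'.split(' ')
rows = [
    [2, det],        # 'A'          -> head at index 0 + 2: 'phrase'
    [1, compound],   # 'base'       -> 'phrase'
    [3, nsubjpass],  # 'phrase'     -> 'recognized'
    [2, aux],        # 'should'     -> 'recognized'
    [1, auxpass],    # 'be'         -> 'recognized'
    [0, root],       # 'recognized' -> itself (sentence root)
    [-1, punct],     # '.'          -> 'recognized'
]
for i, (offset, dep) in enumerate(rows):
    print('{:<12} -> {}'.format(words[i], words[i + offset]))

The single expected chunk 'A base phrase ' is then just the det and compound dependents plus their head noun, with the trailing space coming from the old Span.string API, which includes each token's trailing whitespace.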