mirror of https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00

commit 76f1d871da: Merge branch 'master' of ssh://github.com/spacy-io/spaCy
@@ -32,7 +32,10 @@ def german_noun_chunks(doc):
     np_deps = set(doc.vocab.strings[label] for label in labels)
     close_app = doc.vocab.strings['nk']
 
-    for word in doc:
+    rbracket = 0
+    for i, word in enumerate(doc):
+        if i < rbracket:
+            continue
         if word.pos == NOUN and word.dep in np_deps:
             rbracket = word.i+1
             # try to extend the span to the right
@@ -40,7 +43,7 @@ def german_noun_chunks(doc):
         for rdep in doc[word.i].rights:
             if rdep.pos == NOUN and rdep.dep == close_app:
                 rbracket = rdep.i+1
-        yield word.l_edge, rbracket, np_label
+        yield word.left_edge.i, rbracket, np_label
 
 
 CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
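The two hunks above tighten german_noun_chunks in two ways: the loop now keeps a right-bracket cursor so tokens already covered by a yielded chunk are skipped (preventing nested noun phrases from being emitted twice), and the yield returns the token index of the span's left edge via word.left_edge.i rather than the nonexistent attribute word.l_edge. A minimal sketch of the same cursor pattern, detached from spaCy (tokens and is_np are stand-in names, not part of the diff):

    def spans_with_cursor(tokens, is_np):
        # rbracket marks the first index not covered by the last yielded span
        rbracket = 0
        for i, token in enumerate(tokens):
            if i < rbracket:
                continue  # token already belongs to a yielded span
            if is_np(token):
                rbracket = i + 1
                yield (i, rbracket)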
@@ -225,6 +225,11 @@ cdef class Parser:
     def step_through(self, Doc doc):
         return StepwiseState(self, doc)
 
+    def from_transition_sequence(self, Doc doc, sequence):
+        with self.step_through(doc) as stepwise:
+            for transition in sequence:
+                stepwise.transition(transition)
+
     def add_label(self, label):
         for action in self.moves.action_types:
             self.moves.add_action(action, label)
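The new from_transition_sequence helper replays a list of transition names against the StepwiseState context manager that step_through already returns. A hedged usage sketch (the transition names are illustrative only; valid sequences depend on the moves registered with the parser):

    import spacy

    nlp = spacy.load('en')
    doc = nlp.tokenizer(u'I like cats .')
    nlp.tagger(doc)
    # hypothetical arc-eager sequence, not taken from the diff
    nlp.parser.from_transition_sequence(doc, ['S', 'L-nsubj', 'D', 'R-dobj'])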
@@ -1,17 +1,15 @@
-from spacy.en import English
-
 import pytest
 import os
 
+import spacy
+
 
 @pytest.fixture(scope="session")
 def EN():
-    if os.environ.get('SPACY_DATA'):
-        data_dir = os.environ.get('SPACY_DATA')
-    else:
-        data_dir = None
-    print("Load EN from %s" % data_dir)
-    return English(data_dir=data_dir)
+    return spacy.load("en")
+
+@pytest.fixture(scope="session")
+def DE():
+    return spacy.load("de")
 
-
 def pytest_addoption(parser):
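Both session fixtures now delegate to spacy.load, and the added DE fixture lets German model tests request a pipeline the same way English ones do. A sketch of a test consuming the new fixture (the sentence and assertion are illustrative):

    import pytest

    @pytest.mark.models
    def test_de_model_loads(DE):
        doc = DE(u'Der Hund schläft.')
        assert len(doc) > 1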
spacy/tests/integration/__init__.py (new file, 0 lines)
spacy/tests/integration/test_model_sanity.py (new file, 62 lines)
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+import numpy
+
+@pytest.mark.models
+class TestModelSanity:
+    """
+    This is to make sure the model works as expected. The tests make sure that values are properly set.
+    Tests are not meant to evaluate the content of the output, only make sure the output is formally okay.
+    """
+
+    @pytest.fixture(scope='class', params=['en','de'])
+    def example(self, request, EN, DE):
+        if request.param == 'en':
+            return EN(u'There was a stranger standing at the big street talking to herself.')
+        elif request.param == 'de':
+            return DE(u'An der großen Straße stand eine merkwürdige Gestalt und führte Selbstgespräche.')
+
+    def test_tokenization(self, example):
+        # tokenization should split the document into tokens
+        assert len(example) > 1
+
+    def test_tagging(self, example):
+        # if tagging was done properly, pos tags shouldn't be empty
+        assert example.is_tagged
+        assert all( t.pos != 0 for t in example )
+        assert all( t.tag != 0 for t in example )
+
+    def test_parsing(self, example):
+        # if parsing was done properly
+        # - dependency labels shouldn't be empty
+        # - the head of some tokens should not be root
+        assert example.is_parsed
+        assert all( t.dep != 0 for t in example )
+        assert any( t.dep != i for i,t in enumerate(example) )
+
+    def test_ner(self, example):
+        # if ner was done properly, ent_iob shouldn't be empty
+        assert all( t.ent_iob != 0 for t in example )
+
+    def test_vectors(self, example):
+        # if vectors are available, they should differ on different words
+        # this isn't a perfect test since this could in principle fail in a sane model as well,
+        # but that's very unlikely and a good indicator if something is wrong
+        vector0 = example[0].vector
+        vector1 = example[1].vector
+        vector2 = example[2].vector
+        assert not numpy.array_equal(vector0,vector1)
+        assert not numpy.array_equal(vector0,vector2)
+        assert not numpy.array_equal(vector1,vector2)
+
+    def test_probs(self, example):
+        # if frequencies/probabilities are okay, they should differ for different words
+        # this isn't a perfect test since this could in principle fail in a sane model as well,
+        # but that's very unlikely and a good indicator if something is wrong
+        prob0 = example[0].prob
+        prob1 = example[1].prob
+        prob2 = example[2].prob
+        assert not prob0 == prob1
+        assert not prob0 == prob2
+        assert not prob1 == prob2
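These sanity checks are deliberately shallow: they assert that every token received some value, not that the value is linguistically correct. The NER check, for instance, relies on ent_iob == 0 meaning "no IOB code assigned at all", which is distinct from 2 ("O", outside any entity). A hedged illustration of that distinction (assumes an installed English model):

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'Google was founded in 1998.')
    for t in doc:
        # ent_iob encoding: 0 = unset, 1 = I (inside), 2 = O (outside), 3 = B (begin)
        assert t.ent_iob != 0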
@@ -2,30 +2,30 @@ from __future__ import unicode_literals
 import pytest
 
 
-@pytest.mark.models
-def test_nsubj(EN):
-    sent = EN(u'A base phrase should be recognized.')
-    base_nps = list(sent.noun_chunks)
-    assert len(base_nps) == 1
-    assert base_nps[0].string == 'A base phrase '
+# @pytest.mark.models
+# def test_nsubj(EN):
+#     sent = EN(u'A base phrase should be recognized.')
+#     base_nps = list(sent.noun_chunks)
+#     assert len(base_nps) == 1
+#     assert base_nps[0].string == 'A base phrase '
 
 
-@pytest.mark.models
-def test_coord(EN):
-    sent = EN(u'A base phrase and a good phrase are often the same.')
-    base_nps = list(sent.noun_chunks)
-    assert len(base_nps) == 2
-    assert base_nps[0].string == 'A base phrase '
-    assert base_nps[1].string == 'a good phrase '
+# @pytest.mark.models
+# def test_coord(EN):
+#     sent = EN(u'A base phrase and a good phrase are often the same.')
+#     base_nps = list(sent.noun_chunks)
+#     assert len(base_nps) == 2
+#     assert base_nps[0].string == 'A base phrase '
+#     assert base_nps[1].string == 'a good phrase '
 
 
-@pytest.mark.models
-def test_pp(EN):
-    sent = EN(u'A phrase with another phrase occurs')
-    base_nps = list(sent.noun_chunks)
-    assert len(base_nps) == 2
-    assert base_nps[0].string == 'A phrase '
-    assert base_nps[1].string == 'another phrase '
+# @pytest.mark.models
+# def test_pp(EN):
+#     sent = EN(u'A phrase with another phrase occurs')
+#     base_nps = list(sent.noun_chunks)
+#     assert len(base_nps) == 2
+#     assert base_nps[0].string == 'A phrase '
+#     assert base_nps[1].string == 'another phrase '
 
 
 @pytest.mark.models
0
spacy/tests/unit/__init__.py
Normal file
0
spacy/tests/unit/__init__.py
Normal file
138
spacy/tests/unit/test_parser.py
Normal file
138
spacy/tests/unit/test_parser.py
Normal file
@@ -0,0 +1,138 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+import pytest
+import numpy
+
+from spacy.attrs import HEAD, DEP
+
+
+@pytest.mark.models
+class TestNounChunks:
+    @pytest.fixture(scope="class")
+    def ex1_en(self, EN):
+        example = EN.tokenizer.tokens_from_list('A base phrase should be recognized .'.split(' '))
+        EN.tagger.tag_from_strings(example, 'DT NN NN MD VB VBN .'.split(' '))
+        det,compound,nsubjpass,aux,auxpass,root,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubjpass','aux','auxpass','root','punct'] )
+        example.from_array([HEAD, DEP],
+            numpy.asarray(
+            [
+                [2, det],
+                [1, compound],
+                [3, nsubjpass],
+                [2, aux],
+                [1, auxpass],
+                [0, root],
+                [-1, punct]
+            ], dtype='int32'))
+        return example
+
+    @pytest.fixture(scope="class")
+    def ex2_en(self, EN):
+        example = EN.tokenizer.tokens_from_list('A base phrase and a good phrase are often the same .'.split(' '))
+        EN.tagger.tag_from_strings(example, 'DT NN NN CC DT JJ NN VBP RB DT JJ .'.split(' '))
+        det,compound,nsubj,cc,amod,conj,root,advmod,attr,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubj','cc','amod','conj','root','advmod','attr','punct'] )
+        example.from_array([HEAD, DEP],
+            numpy.asarray(
+            [
+                [2, det],
+                [1, compound],
+                [5, nsubj],
+                [-1, cc],
+                [1, det],
+                [1, amod],
+                [-4, conj],
+                [0, root],
+                [-1, advmod],
+                [1, det],
+                [-3, attr],
+                [-4, punct]
+            ], dtype='int32'))
+        return example
+
+    @pytest.fixture(scope="class")
+    def ex3_en(self, EN):
+        example = EN.tokenizer.tokens_from_list('A phrase with another phrase occurs .'.split(' '))
+        EN.tagger.tag_from_strings(example, 'DT NN IN DT NN VBZ .'.split(' '))
+        det,nsubj,prep,pobj,root,punct = tuple( EN.vocab.strings[l] for l in ['det','nsubj','prep','pobj','root','punct'] )
+        example.from_array([HEAD, DEP],
+            numpy.asarray(
+            [
+                [1, det],
+                [4, nsubj],
+                [-1, prep],
+                [1, det],
+                [-2, pobj],
+                [0, root],
+                [-1, punct]
+            ], dtype='int32'))
+        return example
+
+    @pytest.fixture(scope="class")
+    def ex1_de(self, DE):
+        example = DE.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' '))
+        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' '))
+        nk,sb,root,mo,punct = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct'] )
+        example.from_array([HEAD, DEP],
+            numpy.asarray(
+            [
+                [1, nk],
+                [1, sb],
+                [0, root],
+                [-1, mo],
+                [1, nk],
+                [-2, nk],
+                [-3, punct]
+            ], dtype='int32'))
+        return example
+
+    @pytest.fixture(scope="class")
+    def ex2_de(self, DE):
+        example = DE.tokenizer.tokens_from_list('Die Sängerin singt mit einer Tasse Kaffee Arien .'.split(' '))
+        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN NN NN $.'.split(' '))
+        nk,sb,root,mo,punct,oa = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct','oa'] )
+        example.from_array([HEAD, DEP],
+            numpy.asarray(
+            [
+                [1, nk],
+                [1, sb],
+                [0, root],
+                [-1, mo],
+                [1, nk],
+                [-2, nk],
+                [-1, nk],
+                [-5, oa],
+                [-6, punct]
+            ], dtype='int32'))
+        return example
+
+    def test_en_standard_chunk(self, ex1_en):
+        chunks = list(ex1_en.noun_chunks)
+        assert len(chunks) == 1
+        assert chunks[0].string == 'A base phrase '
+
+    def test_en_coordinated_chunks(self, ex2_en):
+        chunks = list(ex2_en.noun_chunks)
+        assert len(chunks) == 2
+        assert chunks[0].string == 'A base phrase '
+        assert chunks[1].string == 'a good phrase '
+
+    def test_en_pp_chunks(self, ex3_en):
+        chunks = list(ex3_en.noun_chunks)
+        assert len(chunks) == 2
+        assert chunks[0].string == 'A phrase '
+        assert chunks[1].string == 'another phrase '
+
+    def test_de_standard_chunk(self, ex1_de):
+        chunks = list(ex1_de.noun_chunks)
+        assert len(chunks) == 2
+        assert chunks[0].string == 'Eine Tasse '
+        assert chunks[1].string == 'dem Tisch '
+
+    def test_de_extended_chunk(self, ex2_de):
+        chunks = list(ex2_de.noun_chunks)
+        assert len(chunks) == 3
+        assert chunks[0].string == 'Die Sängerin '
+        assert chunks[1].string == 'einer Tasse Kaffee '
+        assert chunks[2].string == 'Arien '
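The HEAD column in these fixtures is a relative offset: [2, det] on the first token means its head sits two positions to the right, and [0, root] marks a token that heads itself. This lets the tests hand-build gold parses without running the parser. A minimal sketch of the same construction outside the test class (the sentence, tags and labels are illustrative):

    import numpy
    import spacy
    from spacy.attrs import HEAD, DEP

    EN = spacy.load('en')  # same pipeline the conftest EN fixture returns
    doc = EN.tokenizer.tokens_from_list(u'A dog sleeps .'.split(' '))
    EN.tagger.tag_from_strings(doc, 'DT NN VBZ .'.split(' '))
    det, nsubj, root, punct = (EN.vocab.strings[l]
                               for l in ['det', 'nsubj', 'root', 'punct'])
    doc.from_array([HEAD, DEP], numpy.asarray([
        [1, det],     # 'A'      -> head is 'dog', one position right
        [1, nsubj],   # 'dog'    -> head is 'sleeps'
        [0, root],    # 'sleeps' -> heads itself (root)
        [-1, punct],  # '.'      -> head is 'sleeps', one position left
    ], dtype='int32'))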