Merge branch 'master' of ssh://github.com/spacy-io/spaCy

Author: Matthew Honnibal
Date:   2016-05-04 15:54:00 +02:00
Commit: 76f1d871da

8 changed files with 236 additions and 30 deletions

View File

@@ -32,7 +32,10 @@ def german_noun_chunks(doc):
     np_deps = set(doc.vocab.strings[label] for label in labels)
     close_app = doc.vocab.strings['nk']
-    for word in doc:
+    rbracket = 0
+    for i, word in enumerate(doc):
+        if i < rbracket:
+            continue
         if word.pos == NOUN and word.dep in np_deps:
             rbracket = word.i+1
             # try to extend the span to the right
@@ -40,7 +43,7 @@ def german_noun_chunks(doc):
             for rdep in doc[word.i].rights:
                 if rdep.pos == NOUN and rdep.dep == close_app:
                     rbracket = rdep.i+1
-            yield word.l_edge, rbracket, np_label
+            yield word.left_edge.i, rbracket, np_label

 CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
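
For context on the change above: the chunker now yields (start, end, label) triples of token indices rather than a Token object's l_edge, and the rbracket counter skips words already covered by a chunk that was extended to the right. Below is a minimal consumption sketch, assuming an installed German model; the import path and the example sentence are assumptions, not part of the diff:

    # Hypothetical usage sketch; the module path is an assumption.
    import spacy
    from spacy.syntax.iterators import CHUNKERS  # assumed location

    nlp = spacy.load('de')
    doc = nlp(u'Eine Tasse steht auf dem Tisch.')
    # the chunker yields (start, end, label) triples of token indices
    for start, end, label in CHUNKERS['de'](doc):
        print(doc[start:end].string, doc.vocab.strings[label])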

View File

@@ -225,6 +225,11 @@ cdef class Parser:
     def step_through(self, Doc doc):
         return StepwiseState(self, doc)

+    def from_transition_sequence(self, Doc doc, sequence):
+        with self.step_through(doc) as stepwise:
+            for transition in sequence:
+                stepwise.transition(transition)
+
     def add_label(self, label):
         for action in self.moves.action_types:
             self.moves.add_action(action, label)
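
The new from_transition_sequence helper replays a precomputed sequence of transitions through the stepwise interface, so a test can construct a specific parse deterministically instead of relying on model predictions. A minimal sketch, assuming a loaded English pipeline EN as provided by the test fixtures below; the transition names are illustrative, since valid names depend on the transition system and registered labels:

    # Hypothetical usage sketch; transition names are illustrative only.
    doc = EN.tokenizer.tokens_from_list(u'A phrase .'.split(' '))
    EN.tagger.tag_from_strings(doc, 'DT NN .'.split(' '))
    EN.parser.from_transition_sequence(doc, ['S', 'L-det', 'R-punct'])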

View File

@@ -1,17 +1,15 @@
-from spacy.en import English
 import pytest
 import os
+import spacy

 @pytest.fixture(scope="session")
 def EN():
-    if os.environ.get('SPACY_DATA'):
-        data_dir = os.environ.get('SPACY_DATA')
-    else:
-        data_dir = None
-    print("Load EN from %s" % data_dir)
-    return English(data_dir=data_dir)
+    return spacy.load("en")
+
+@pytest.fixture(scope="session")
+def DE():
+    return spacy.load("de")

 def pytest_addoption(parser):
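
With the fixtures reduced to spacy.load, a test pulls in a fully loaded English or German pipeline simply by naming the fixture as a parameter. A minimal sketch of such a test; the test body is an assumption for illustration:

    # Hypothetical test sketch; both fixtures hand back loaded pipelines.
    import pytest

    @pytest.mark.models
    def test_pipelines_load(EN, DE):
        assert EN(u'Hello world.').is_tagged
        assert DE(u'Hallo Welt.').is_tagged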

View File

View File

@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
import pytest
import numpy


@pytest.mark.models
class TestModelSanity:
    """
    This is to make sure the model works as expected. The tests make sure
    that values are properly set. Tests are not meant to evaluate the
    content of the output, only to make sure the output is formally okay.
    """

    @pytest.fixture(scope='class', params=['en', 'de'])
    def example(self, request, EN, DE):
        if request.param == 'en':
            return EN(u'There was a stranger standing at the big street talking to herself.')
        elif request.param == 'de':
            return DE(u'An der großen Straße stand eine merkwürdige Gestalt und führte Selbstgespräche.')

    def test_tokenization(self, example):
        # tokenization should split the document into tokens
        assert len(example) > 1

    def test_tagging(self, example):
        # if tagging was done properly, pos tags shouldn't be empty
        assert example.is_tagged
        assert all(t.pos != 0 for t in example)
        assert all(t.tag != 0 for t in example)

    def test_parsing(self, example):
        # if parsing was done properly
        # - dependency labels shouldn't be empty
        # - the head of some tokens should not be root
        assert example.is_parsed
        assert all(t.dep != 0 for t in example)
        assert any(t.dep != i for i, t in enumerate(example))

    def test_ner(self, example):
        # if ner was done properly, ent_iob shouldn't be empty
        assert all(t.ent_iob != 0 for t in example)

    def test_vectors(self, example):
        # if vectors are available, they should differ on different words
        # this isn't a perfect test since this could in principle fail in
        # a sane model as well, but that's very unlikely and a good
        # indicator if something is wrong
        vector0 = example[0].vector
        vector1 = example[1].vector
        vector2 = example[2].vector
        assert not numpy.array_equal(vector0, vector1)
        assert not numpy.array_equal(vector0, vector2)
        assert not numpy.array_equal(vector1, vector2)

    def test_probs(self, example):
        # if frequencies/probabilities are okay, they should differ for
        # different words
        # this isn't a perfect test since this could in principle fail in
        # a sane model as well, but that's very unlikely and a good
        # indicator if something is wrong
        prob0 = example[0].prob
        prob1 = example[1].prob
        prob2 = example[2].prob
        assert not prob0 == prob1
        assert not prob0 == prob2
        assert not prob1 == prob2
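
These sanity tests are gated behind the models marker, and the conftest.py diff above shows pytest_addoption but elides its body. One common way to wire such a marker, shown purely as an assumption about what that body might look like, is to register a --models option and skip marked tests when it is absent:

    # Hypothetical conftest wiring; a common pytest pattern, not
    # necessarily what spaCy's conftest actually contains.
    import pytest

    def pytest_addoption(parser):
        parser.addoption('--models', action='store_true',
                         help='run tests that require installed models')

    def pytest_runtest_setup(item):
        if 'models' in item.keywords and not item.config.getoption('--models'):
            pytest.skip('need --models option to run this test')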

View File

@@ -2,30 +2,30 @@ from __future__ import unicode_literals
 import pytest

-@pytest.mark.models
-def test_nsubj(EN):
-    sent = EN(u'A base phrase should be recognized.')
-    base_nps = list(sent.noun_chunks)
-    assert len(base_nps) == 1
-    assert base_nps[0].string == 'A base phrase '
+# @pytest.mark.models
+# def test_nsubj(EN):
+#     sent = EN(u'A base phrase should be recognized.')
+#     base_nps = list(sent.noun_chunks)
+#     assert len(base_nps) == 1
+#     assert base_nps[0].string == 'A base phrase '

-@pytest.mark.models
-def test_coord(EN):
-    sent = EN(u'A base phrase and a good phrase are often the same.')
-    base_nps = list(sent.noun_chunks)
-    assert len(base_nps) == 2
-    assert base_nps[0].string == 'A base phrase '
-    assert base_nps[1].string == 'a good phrase '
+# @pytest.mark.models
+# def test_coord(EN):
+#     sent = EN(u'A base phrase and a good phrase are often the same.')
+#     base_nps = list(sent.noun_chunks)
+#     assert len(base_nps) == 2
+#     assert base_nps[0].string == 'A base phrase '
+#     assert base_nps[1].string == 'a good phrase '

-@pytest.mark.models
-def test_pp(EN):
-    sent = EN(u'A phrase with another phrase occurs')
-    base_nps = list(sent.noun_chunks)
-    assert len(base_nps) == 2
-    assert base_nps[0].string == 'A phrase '
-    assert base_nps[1].string == 'another phrase '
+# @pytest.mark.models
+# def test_pp(EN):
+#     sent = EN(u'A phrase with another phrase occurs')
+#     base_nps = list(sent.noun_chunks)
+#     assert len(base_nps) == 2
+#     assert base_nps[0].string == 'A phrase '
+#     assert base_nps[1].string == 'another phrase '

 @pytest.mark.models

View File

View File

@@ -0,0 +1,138 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import pytest
import numpy

from spacy.attrs import HEAD, DEP


@pytest.mark.models
class TestNounChunks:
    @pytest.fixture(scope="class")
    def ex1_en(self, EN):
        example = EN.tokenizer.tokens_from_list('A base phrase should be recognized .'.split(' '))
        EN.tagger.tag_from_strings(example, 'DT NN NN MD VB VBN .'.split(' '))
        det, compound, nsubjpass, aux, auxpass, root, punct = tuple(
            EN.vocab.strings[l] for l in
            ['det', 'compound', 'nsubjpass', 'aux', 'auxpass', 'root', 'punct'])
        example.from_array([HEAD, DEP],
            numpy.asarray([
                [2, det],
                [1, compound],
                [3, nsubjpass],
                [2, aux],
                [1, auxpass],
                [0, root],
                [-1, punct]
            ], dtype='int32'))
        return example

    @pytest.fixture(scope="class")
    def ex2_en(self, EN):
        example = EN.tokenizer.tokens_from_list('A base phrase and a good phrase are often the same .'.split(' '))
        EN.tagger.tag_from_strings(example, 'DT NN NN CC DT JJ NN VBP RB DT JJ .'.split(' '))
        det, compound, nsubj, cc, amod, conj, root, advmod, attr, punct = tuple(
            EN.vocab.strings[l] for l in
            ['det', 'compound', 'nsubj', 'cc', 'amod', 'conj', 'root', 'advmod', 'attr', 'punct'])
        example.from_array([HEAD, DEP],
            numpy.asarray([
                [2, det],
                [1, compound],
                [5, nsubj],
                [-1, cc],
                [1, det],
                [1, amod],
                [-4, conj],
                [0, root],
                [-1, advmod],
                [1, det],
                [-3, attr],
                [-4, punct]
            ], dtype='int32'))
        return example

    @pytest.fixture(scope="class")
    def ex3_en(self, EN):
        example = EN.tokenizer.tokens_from_list('A phrase with another phrase occurs .'.split(' '))
        EN.tagger.tag_from_strings(example, 'DT NN IN DT NN VBZ .'.split(' '))
        det, nsubj, prep, pobj, root, punct = tuple(
            EN.vocab.strings[l] for l in
            ['det', 'nsubj', 'prep', 'pobj', 'root', 'punct'])
        example.from_array([HEAD, DEP],
            numpy.asarray([
                [1, det],
                [4, nsubj],
                [-1, prep],
                [1, det],
                [-2, pobj],
                [0, root],
                [-1, punct]
            ], dtype='int32'))
        return example

    @pytest.fixture(scope="class")
    def ex1_de(self, DE):
        example = DE.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' '))
        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' '))
        nk, sb, root, mo, punct = tuple(
            DE.vocab.strings[l] for l in ['nk', 'sb', 'root', 'mo', 'punct'])
        example.from_array([HEAD, DEP],
            numpy.asarray([
                [1, nk],
                [1, sb],
                [0, root],
                [-1, mo],
                [1, nk],
                [-2, nk],
                [-3, punct]
            ], dtype='int32'))
        return example

    @pytest.fixture(scope="class")
    def ex2_de(self, DE):
        example = DE.tokenizer.tokens_from_list('Die Sängerin singt mit einer Tasse Kaffee Arien .'.split(' '))
        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN NN NN $.'.split(' '))
        nk, sb, root, mo, punct, oa = tuple(
            DE.vocab.strings[l] for l in ['nk', 'sb', 'root', 'mo', 'punct', 'oa'])
        example.from_array([HEAD, DEP],
            numpy.asarray([
                [1, nk],
                [1, sb],
                [0, root],
                [-1, mo],
                [1, nk],
                [-2, nk],
                [-1, nk],
                [-5, oa],
                [-6, punct]
            ], dtype='int32'))
        return example

    def test_en_standard_chunk(self, ex1_en):
        chunks = list(ex1_en.noun_chunks)
        assert len(chunks) == 1
        assert chunks[0].string == 'A base phrase '

    def test_en_coordinated_chunks(self, ex2_en):
        chunks = list(ex2_en.noun_chunks)
        assert len(chunks) == 2
        assert chunks[0].string == 'A base phrase '
        assert chunks[1].string == 'a good phrase '

    def test_en_pp_chunks(self, ex3_en):
        chunks = list(ex3_en.noun_chunks)
        assert len(chunks) == 2
        assert chunks[0].string == 'A phrase '
        assert chunks[1].string == 'another phrase '

    def test_de_standard_chunk(self, ex1_de):
        chunks = list(ex1_de.noun_chunks)
        assert len(chunks) == 2
        assert chunks[0].string == 'Eine Tasse '
        assert chunks[1].string == 'dem Tisch '

    def test_de_extended_chunk(self, ex2_de):
        chunks = list(ex2_de.noun_chunks)
        assert len(chunks) == 3
        assert chunks[0].string == 'Die Sängerin '
        assert chunks[1].string == 'einer Tasse Kaffee '
        assert chunks[2].string == 'Arien '
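
A note on the encoding these fixtures rely on: in Doc.from_array([HEAD, DEP], ...), the HEAD column holds each head as an offset relative to the token's own index, not an absolute position, and the DEP column holds string-table ids. In ex1_en, for example, the row [2, det] for token 0 ('A') says its head is token 0 + 2 = 2 ('phrase') with relation det, and the root marks itself with offset 0. A small sketch decoding such a matrix by hand; the helper is hypothetical:

    # Hypothetical helper: HEAD values are offsets, so the absolute
    # head index is the token's own index plus the offset.
    def decode_heads(words, head_offsets):
        for i, offset in enumerate(head_offsets):
            print('%s -> %s' % (words[i], words[i + offset]))

    decode_heads('A base phrase should be recognized .'.split(' '),
                 [2, 1, 3, 2, 1, 0, -1])
    # A -> phrase, base -> phrase, phrase -> recognized,
    # should -> recognized, be -> recognized,
    # recognized -> recognized (root), . -> recognized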