From 1786331cd82674a5d4ec14cce74d135278dae84d Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Tue, 3 May 2016 12:51:47 +0200 Subject: [PATCH 1/7] add model sanity test --- spacy/tests/conftest.py | 14 ++--- spacy/tests/integration/__init__.py | 0 spacy/tests/integration/test_model_sanity.py | 62 ++++++++++++++++++++ 3 files changed, 68 insertions(+), 8 deletions(-) create mode 100644 spacy/tests/integration/__init__.py create mode 100644 spacy/tests/integration/test_model_sanity.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 83a39a03a..cf7fd223a 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -1,17 +1,15 @@ -from spacy.en import English - import pytest import os +import spacy @pytest.fixture(scope="session") def EN(): - if os.environ.get('SPACY_DATA'): - data_dir = os.environ.get('SPACY_DATA') - else: - data_dir = None - print("Load EN from %s" % data_dir) - return English(data_dir=data_dir) + return spacy.load("en") + +@pytest.fixture(score="session") +def DE(): + return spacy.load("de") def pytest_addoption(parser): diff --git a/spacy/tests/integration/__init__.py b/spacy/tests/integration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/integration/test_model_sanity.py b/spacy/tests/integration/test_model_sanity.py new file mode 100644 index 000000000..0cddb85dd --- /dev/null +++ b/spacy/tests/integration/test_model_sanity.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- + +import pytest +import numpy + +@pytest.mark.models +class TestModelSanity: + """ + This is to make sure the model works as expected. The tests make sure that values are properly set. + Tests are not meant to evaluate the content of the output, only make sure the output is formally okay. + """ + + @pytest.fixture(scope='class', params=['en','de']) + def example(self, request, EN, DE): + if request.param == 'en': + return EN(u'There was a stranger standing at the big street talking to herself.') + elif request.param == 'de': + return DE(u'An der großen Straße stand eine merkwürdige Gestalt und führte Selbstgespräche.') + + def test_tokenization(self, example): + # tokenization should split the document into tokens + assert len(example) > 1 + + def test_tagging(self, example): + # if tagging was done properly, pos tags shouldn't be empty + assert example.is_tagged + assert all( t.pos != 0 for t in example ) + assert all( t.tag != 0 for t in example ) + + def test_parsing(self, example): + # if parsing was done properly + # - dependency labels shouldn't be empty + # - the head of some tokens should not be root + assert example.is_parsed + assert all( t.dep != 0 for t in example ) + assert any( t.dep != i for i,t in enumerate(example) ) + + def test_ner(self, example): + # if ner was done properly, ent_iob shouldn't be empty + assert all( t.ent_iob != 0 for t in example ) + + def test_vectors(self, example): + # if vectors are available, they should differ on different words + # this isn't a perfect test since this could in principle fail in a sane model as well, + # but that's very unlikely and a good indicator if something is wrong + vector0 = example[0].vector + vector1 = example[1].vector + vector2 = example[2].vector + assert not numpy.array_equal(vector0,vector1) + assert not numpy.array_equal(vector0,vector2) + assert not numpy.array_equal(vector1,vector2) + + def test_probs(self, example): + # if frequencies/probabilities are okay, they should differ for different words + # this isn't a perfect test since this could in principle 
fail in a sane model as well, + # but that's very unlikely and a good indicator if something is wrong + prob0 = example[0].prob + prob1 = example[1].prob + prob2 = example[2].prob + assert not prob0 == prob1 + assert not prob0 == prob2 + assert not prob1 == prob2 From 7b246c13cbe58946cf75b8d860db80bf22963a93 Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Tue, 3 May 2016 14:24:35 +0200 Subject: [PATCH 2/7] reformulate noun chunk tests for English --- spacy/symbols.pxd | 1 + spacy/symbols.pyx | 1 + spacy/syntax/parser.pyx | 5 ++ spacy/tests/conftest.py | 2 +- spacy/tests/parser/test_base_nps.py | 40 +++++++------- spacy/tests/unit/__init__.py | 0 spacy/tests/unit/test_parser.py | 83 +++++++++++++++++++++++++++++ 7 files changed, 111 insertions(+), 21 deletions(-) create mode 100644 spacy/tests/unit/__init__.py create mode 100644 spacy/tests/unit/test_parser.py diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 942d8aa9c..d577eaf6d 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -382,6 +382,7 @@ cpdef enum symbol_t: cc ccomp complm + compound conj csubj csubjpass diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 712bef9a3..0e8dcda13 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -381,6 +381,7 @@ IDS = { "cc": cc, "ccomp": ccomp, "complm": complm, + "compound": compound, "conj": conj, "csubj": csubj, "csubjpass": csubjpass, diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 04f9d5f22..22f37127a 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -225,6 +225,11 @@ cdef class Parser: def step_through(self, Doc doc): return StepwiseState(self, doc) + def from_transition_sequence(self, Doc doc, sequence): + with self.step_through(doc) as stepwise: + for transition in sequence: + stepwise.transition(transition) + def add_label(self, label): for action in self.moves.action_types: self.moves.add_action(action, label) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index cf7fd223a..cc64ee46f 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -7,7 +7,7 @@ import spacy def EN(): return spacy.load("en") -@pytest.fixture(score="session") +@pytest.fixture(scope="session") def DE(): return spacy.load("de") diff --git a/spacy/tests/parser/test_base_nps.py b/spacy/tests/parser/test_base_nps.py index 8d308bc8d..b598158d0 100644 --- a/spacy/tests/parser/test_base_nps.py +++ b/spacy/tests/parser/test_base_nps.py @@ -2,30 +2,30 @@ from __future__ import unicode_literals import pytest -@pytest.mark.models -def test_nsubj(EN): - sent = EN(u'A base phrase should be recognized.') - base_nps = list(sent.noun_chunks) - assert len(base_nps) == 1 - assert base_nps[0].string == 'A base phrase ' +# @pytest.mark.models +# def test_nsubj(EN): +# sent = EN(u'A base phrase should be recognized.') +# base_nps = list(sent.noun_chunks) +# assert len(base_nps) == 1 +# assert base_nps[0].string == 'A base phrase ' -@pytest.mark.models -def test_coord(EN): - sent = EN(u'A base phrase and a good phrase are often the same.') - base_nps = list(sent.noun_chunks) - assert len(base_nps) == 2 - assert base_nps[0].string == 'A base phrase ' - assert base_nps[1].string == 'a good phrase ' +# @pytest.mark.models +# def test_coord(EN): +# sent = EN(u'A base phrase and a good phrase are often the same.') +# base_nps = list(sent.noun_chunks) +# assert len(base_nps) == 2 +# assert base_nps[0].string == 'A base phrase ' +# assert base_nps[1].string == 'a good phrase ' -@pytest.mark.models -def test_pp(EN): - sent = EN(u'A phrase with 
another phrase occurs') - base_nps = list(sent.noun_chunks) - assert len(base_nps) == 2 - assert base_nps[0].string == 'A phrase ' - assert base_nps[1].string == 'another phrase ' +# @pytest.mark.models +# def test_pp(EN): +# sent = EN(u'A phrase with another phrase occurs') +# base_nps = list(sent.noun_chunks) +# assert len(base_nps) == 2 +# assert base_nps[0].string == 'A phrase ' +# assert base_nps[1].string == 'another phrase ' @pytest.mark.models diff --git a/spacy/tests/unit/__init__.py b/spacy/tests/unit/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/unit/test_parser.py b/spacy/tests/unit/test_parser.py new file mode 100644 index 000000000..7fdb663ed --- /dev/null +++ b/spacy/tests/unit/test_parser.py @@ -0,0 +1,83 @@ +from __future__ import unicode_literals + +import pytest +import numpy + +from spacy.attrs import HEAD, DEP +from spacy.symbols import root, det, compound, nsubjpass, aux, auxpass, punct, nsubj, cc, amod, conj, advmod, attr, prep, pobj + + +@pytest.mark.models +class TestNounChunks: + @pytest.fixture(scope="class") + def ex1_en(self, EN): + example = EN.tokenizer.tokens_from_list('A base phrase should be recognized .'.split(' ')) + EN.tagger.tag_from_strings(example, 'DT NN NN MD VB VBN .'.split(' ')) + example.from_array([HEAD, DEP], + numpy.asarray( + [ + [2, det], + [1, compound], + [3, nsubjpass], + [2, aux], + [1, auxpass], + [0, root], + [-1, punct] + ], dtype='int32')) + return example + + @pytest.fixture(scope="class") + def ex2_en(self, EN): + example = EN.tokenizer.tokens_from_list('A base phrase and a good phrase are often the same .'.split(' ')) + EN.tagger.tag_from_strings(example, 'DT NN NN CC DT JJ NN VBP RB DT JJ .'.split(' ')) + example.from_array([HEAD, DEP], + numpy.asarray( + [ + [2, det], + [1, compound], + [5, nsubj], + [-1, cc], + [1, det], + [1, amod], + [-4, conj], + [0, root], + [-1, advmod], + [1, det], + [-3, attr], + [-4, punct] + ], dtype='int32')) + return example + + @pytest.fixture(scope="class") + def ex3_en(self, EN): + example = EN.tokenizer.tokens_from_list('A phrase with another phrase occurs .'.split(' ')) + EN.tagger.tag_from_strings(example, 'DT NN IN DT NN VBZ .'.split(' ')) + example.from_array([HEAD, DEP], + numpy.asarray( + [ + [1, det], + [4, nsubj], + [-1, prep], + [1, det], + [-2, pobj], + [0, root], + [-1, punct] + ], dtype='int32')) + return example + + def test_standard_chunk(self, ex1_en): + chunks = list(ex1_en.noun_chunks) + assert len(chunks) == 1 + assert chunks[0].string == 'A base phrase ' + + def test_coordinated_chunks(self, ex2_en): + chunks = list(ex2_en.noun_chunks) + assert len(chunks) == 2 + assert chunks[0].string == 'A base phrase ' + assert chunks[1].string == 'a good phrase ' + + def test_pp_chunks(self, ex3_en): + chunks = list(ex3_en.noun_chunks) + assert len(chunks) == 2 + assert chunks[0].string == 'A phrase ' + assert chunks[1].string == 'another phrase ' From 7825b7554813c83e0423cec48ba293959d006840 Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Tue, 3 May 2016 15:01:28 +0200 Subject: [PATCH 3/7] add tests for German noun chunker --- spacy/tests/unit/test_parser.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/spacy/tests/unit/test_parser.py b/spacy/tests/unit/test_parser.py index 7fdb663ed..ba224b9ec 100644 --- a/spacy/tests/unit/test_parser.py +++ b/spacy/tests/unit/test_parser.py @@ -65,18 +65,35 @@ class TestNounChunks: ], dtype='int32')) return example - def test_standard_chunk(self, ex1_en): + # 
@pytest.fixture(score="class") + # def ex1_de(self, DE): + # example = EN.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' ')) + # EN.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' ')) + # example.from_array([HEAD, DEP], + # numpy.asarray( + # [ + # [1, det], + # [4, nsubj], + # [-1, prep], + # [1, det], + # [-2, pobj], + # [0, root], + # [-1, punct] + # ], dtype='int32')) + # return example + + def test_en_standard_chunk(self, ex1_en): chunks = list(ex1_en.noun_chunks) assert len(chunks) == 1 assert chunks[0].string == 'A base phrase ' - def test_coordinated_chunks(self, ex2_en): + def test_en_coordinated_chunks(self, ex2_en): chunks = list(ex2_en.noun_chunks) assert len(chunks) == 2 assert chunks[0].string == 'A base phrase ' assert chunks[1].string == 'a good phrase ' - def test_pp_chunks(self, ex3_en): + def test_en_pp_chunks(self, ex3_en): chunks = list(ex3_en.noun_chunks) assert len(chunks) == 2 assert chunks[0].string == 'A phrase ' From fd8019ec92952f38aa65dcdf07dd5003eb0bebef Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Tue, 3 May 2016 15:53:30 +0200 Subject: [PATCH 4/7] fix typo in german_noun_chunks --- spacy/syntax/iterators.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index b8b810d36..653c89d8f 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -40,7 +40,7 @@ def german_noun_chunks(doc): for rdep in doc[word.i].rights: if rdep.pos == NOUN and rdep.dep == close_app: rbracket = rdep.i+1 - yield word.l_edge, rbracket, np_label + yield word.left_edge.i, rbracket, np_label CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks} From a06fca9fdf48353aa671a84c090f4d21d53b6ec6 Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Tue, 3 May 2016 16:58:59 +0200 Subject: [PATCH 5/7] German noun chunk iterator now doesn't return tokens more than once --- spacy/symbols.pxd | 1 - spacy/symbols.pyx | 1 - spacy/syntax/iterators.pyx | 10 +++-- spacy/tests/unit/test_parser.py | 72 +++++++++++++++++++++++++-------- 4 files changed, 62 insertions(+), 22 deletions(-) diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index d577eaf6d..942d8aa9c 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -382,7 +382,6 @@ cpdef enum symbol_t: cc ccomp complm - compound conj csubj csubjpass diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 0e8dcda13..712bef9a3 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -381,7 +381,6 @@ IDS = { "cc": cc, "ccomp": ccomp, "complm": complm, - "compound": compound, "conj": conj, "csubj": csubj, "csubjpass": csubjpass, diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index b8b810d36..395f772ce 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -32,7 +32,9 @@ def german_noun_chunks(doc): np_deps = set(doc.vocab.strings[label] for label in labels) close_app = doc.vocab.strings['nk'] - for word in doc: + i = 0 + while i < len(doc): + word = doc[i] if word.pos == NOUN and word.dep in np_deps: rbracket = word.i+1 # try to extend the span to the right @@ -40,7 +42,9 @@ def german_noun_chunks(doc): for rdep in doc[word.i].rights: if rdep.pos == NOUN and rdep.dep == close_app: rbracket = rdep.i+1 - yield word.l_edge, rbracket, np_label - + yield word.left_edge.i, rbracket, np_label + i = rbracket + continue + i += 1 CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks} diff --git a/spacy/tests/unit/test_parser.py 
b/spacy/tests/unit/test_parser.py index ba224b9ec..78bfad293 100644 --- a/spacy/tests/unit/test_parser.py +++ b/spacy/tests/unit/test_parser.py @@ -1,10 +1,11 @@ +# -*- coding: utf-8 -*- + from __future__ import unicode_literals import pytest import numpy from spacy.attrs import HEAD, DEP -from spacy.symbols import root, det, compound, nsubjpass, aux, auxpass, punct, nsubj, cc, amod, conj, advmod, attr, prep, pobj @pytest.mark.models @@ -13,6 +14,7 @@ class TestNounChunks: def ex1_en(self, EN): example = EN.tokenizer.tokens_from_list('A base phrase should be recognized .'.split(' ')) EN.tagger.tag_from_strings(example, 'DT NN NN MD VB VBN .'.split(' ')) + det,compound,nsubjpass,aux,auxpass,root,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubjpass','aux','auxpass','root','punct'] ) example.from_array([HEAD, DEP], numpy.asarray( [ @@ -30,6 +32,7 @@ class TestNounChunks: def ex2_en(self, EN): example = EN.tokenizer.tokens_from_list('A base phrase and a good phrase are often the same .'.split(' ')) EN.tagger.tag_from_strings(example, 'DT NN NN CC DT JJ NN VBP RB DT JJ .'.split(' ')) + det,compound,nsubj,cc,amod,conj,root,advmod,attr,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubj','cc','amod','conj','root','advmod','attr','punct'] ) example.from_array([HEAD, DEP], numpy.asarray( [ @@ -52,6 +55,7 @@ class TestNounChunks: def ex3_en(self, EN): example = EN.tokenizer.tokens_from_list('A phrase with another phrase occurs .'.split(' ')) EN.tagger.tag_from_strings(example, 'DT NN IN DT NN VBZ .'.split(' ')) + det,nsubj,prep,pobj,root,punct = tuple( EN.vocab.strings[l] for l in ['det','nsubj','prep','pobj','root','punct'] ) example.from_array([HEAD, DEP], numpy.asarray( [ @@ -65,22 +69,43 @@ class TestNounChunks: ], dtype='int32')) return example - # @pytest.fixture(score="class") - # def ex1_de(self, DE): - # example = EN.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' ')) - # EN.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' ')) - # example.from_array([HEAD, DEP], - # numpy.asarray( - # [ - # [1, det], - # [4, nsubj], - # [-1, prep], - # [1, det], - # [-2, pobj], - # [0, root], - # [-1, punct] - # ], dtype='int32')) - # return example + @pytest.fixture(scope="class") + def ex1_de(self, DE): + example = DE.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' ')) + DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' ')) + nk,sb,root,mo,punct = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct']) + example.from_array([HEAD, DEP], + numpy.asarray( + [ + [1, nk], + [1, sb], + [0, root], + [-1, mo], + [1, nk], + [-2, nk], + [-3, punct] + ], dtype='int32')) + return example + + @pytest.fixture(scope="class") + def ex2_de(self, DE): + example = DE.tokenizer.tokens_from_list('Die Sängerin singt mit einer Tasse Kaffee Arien .'.split(' ')) + DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN NN NN $.'.split(' ')) + nk,sb,root,mo,punct,oa = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct','oa']) + example.from_array([HEAD, DEP], + numpy.asarray( + [ + [1, nk], + [1, sb], + [0, root], + [-1, mo], + [1, nk], + [-2, nk], + [-1, nk], + [-5, oa], + [-6, punct] + ], dtype='int32')) + return example def test_en_standard_chunk(self, ex1_en): chunks = list(ex1_en.noun_chunks) @@ -98,3 +123,16 @@ class TestNounChunks: assert len(chunks) == 2 assert chunks[0].string == 'A phrase ' assert chunks[1].string == 'another phrase ' + + def 
test_de_standard_chunk(self, ex1_de): + chunks = list(ex1_de.noun_chunks) + assert len(chunks) == 2 + assert chunks[0].string == 'Eine Tasse ' + assert chunks[1].string == 'dem Tisch ' + + def test_de_extended_chunk(self, ex2_de): + chunks = list(ex2_de.noun_chunks) + assert len(chunks) == 3 + assert chunks[0].string == 'Die Sängerin ' + assert chunks[1].string == 'einer Tasse Kaffee ' + assert chunks[2].string == 'Arien ' From 5bf2fd1f788ccb6e9691b6d137edfd65120ee4ef Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Tue, 3 May 2016 17:19:05 +0200 Subject: [PATCH 6/7] make the code less cryptic --- spacy/syntax/iterators.pyx | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index 395f772ce..a02dce0b7 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -32,9 +32,10 @@ def german_noun_chunks(doc): np_deps = set(doc.vocab.strings[label] for label in labels) close_app = doc.vocab.strings['nk'] - i = 0 - while i < len(doc): - word = doc[i] + rbracket = 0 + for i, word in enumerate(doc): + if i < rbracket: + continue if word.pos == NOUN and word.dep in np_deps: rbracket = word.i+1 # try to extend the span to the right @@ -42,9 +43,6 @@ def german_noun_chunks(doc): for rdep in doc[word.i].rights: if rdep.pos == NOUN and rdep.dep == close_app: rbracket = rdep.i+1 - yield word.left_edge.i, rbracket, np_label - i = rbracket - continue - i += 1 + yield word.left_edge.i, rbracket, np_label CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks} From e4ea2bea01b991ddde345b2b6026cd063765c1a7 Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Wed, 4 May 2016 07:40:38 +0200 Subject: [PATCH 7/7] fix whitespace --- spacy/syntax/iterators.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index a02dce0b7..516c2e41b 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -43,6 +43,7 @@ def german_noun_chunks(doc): for rdep in doc[word.i].rights: if rdep.pos == NOUN and rdep.dep == close_app: rbracket = rdep.i+1 - yield word.left_edge.i, rbracket, np_label + yield word.left_edge.i, rbracket, np_label + CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
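
A note on reading the fixtures introduced in patches 2 and 3: the HEAD column passed to from_array holds relative offsets, not absolute indices. Row i stores head(i) - i, so an offset of 0 marks the root token. A quick decoder over the ex3_en fixture, plain Python with no spaCy dependency:

    # HEAD values are relative: absolute head index = token index + offset.
    offsets = [1, 4, -1, 1, -2, 0, -1]  # HEAD column of ex3_en: 'A phrase with another phrase occurs .'
    heads = [i + off for i, off in enumerate(offsets)]
    print(heads)
    # [1, 5, 1, 4, 2, 5, 5]: 'A' attaches to 'phrase', 'phrase' to 'occurs',
    # 'occurs' heads itself (the root), and '.' attaches back to 'occurs'.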
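
For reference, here is a self-contained sketch of the scanning pattern that patches 5 through 7 converge on for german_noun_chunks: walk the document left to right, skip any token already covered by the last chunk yielded (tracked via rbracket), and extend a noun head rightward over a close apposition before yielding. The Tok class and the label set below are toy stand-ins assumed for illustration only; the real iterator compares integer IDs obtained from doc.vocab.strings.

    class Tok(object):
        """Toy token carrying just the fields the chunker consults."""
        def __init__(self, i, text, pos, dep, left_edge_i=None):
            self.i = i
            self.text = text
            self.pos = pos
            self.dep = dep
            self.left_edge_i = i if left_edge_i is None else left_edge_i
            self.rights = []  # right dependents, filled in after construction

    NP_DEPS = {'sb', 'oa', 'nk'}  # assumed subset of the German NP-internal labels
    CLOSE_APP = 'nk'              # close apposition, as in 'einer Tasse Kaffee'

    def german_noun_chunks(tokens):
        rbracket = 0
        for i, word in enumerate(tokens):
            if i < rbracket:
                continue  # token already inside the chunk just yielded
            if word.pos == 'NOUN' and word.dep in NP_DEPS:
                rbracket = word.i + 1
                # try to extend the chunk over close appositions to the right
                for rdep in word.rights:
                    if rdep.pos == 'NOUN' and rdep.dep == CLOSE_APP:
                        rbracket = rdep.i + 1
                yield word.left_edge_i, rbracket, 'NP'

    # 'Die Saengerin singt mit einer Tasse Kaffee Arien .' (ex2_de, umlaut ASCII-folded)
    toks = [Tok(0, 'Die', 'DET', 'nk'),
            Tok(1, 'Saengerin', 'NOUN', 'sb', left_edge_i=0),
            Tok(2, 'singt', 'VERB', 'root'),
            Tok(3, 'mit', 'ADP', 'mo'),
            Tok(4, 'einer', 'DET', 'nk'),
            Tok(5, 'Tasse', 'NOUN', 'nk', left_edge_i=4),
            Tok(6, 'Kaffee', 'NOUN', 'nk'),
            Tok(7, 'Arien', 'NOUN', 'oa'),
            Tok(8, '.', 'PUNCT', 'punct')]
    toks[5].rights = [toks[6]]  # 'Kaffee' is a right dependent of 'Tasse'

    print(list(german_noun_chunks(toks)))
    # [(0, 2, 'NP'), (4, 7, 'NP'), (7, 8, 'NP')] -- matching ex2_de's three chunks.

Without the i < rbracket guard, 'Kaffee' (a NOUN whose dep 'nk' is both a chunk-internal label and the close-apposition label) would be yielded a second time as its own overlapping chunk; that duplicate is exactly what patch 5 removes.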