diff --git a/lang_data/en/gazetteer.json b/lang_data/en/gazetteer.json
index 1aa6b9514..dce2e1f2a 100644
--- a/lang_data/en/gazetteer.json
+++ b/lang_data/en/gazetteer.json
@@ -14,8 +14,8 @@
                 {"orth": "9/11"}
             ],
             [
-                {"lower": "Septmber"},
-                {"lower": "Eleven"}
+                {"lower": "septmber"},
+                {"lower": "eleven"}
             ],
             [
                 {"lower": "september"},
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index 6282339bd..59b90920c 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -84,8 +84,7 @@ cdef class Parser:
 
         cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
                                   self.model.n_feats, self.model.n_feats)
-        with nogil:
-            self.parse(stcls, eg.c)
+        self.parse(stcls, eg.c)
         tokens.set_parse(stcls._sent)
 
     cdef void predict(self, StateClass stcls, ExampleC* eg) nogil:
@@ -98,6 +97,8 @@ cdef class Parser:
     cdef void parse(self, StateClass stcls, ExampleC eg) nogil:
         while not stcls.is_final():
             self.predict(stcls, &eg)
+            if not eg.is_valid[eg.guess]:
+                break
             self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
         self.moves.finalize_state(stcls)
 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 7994c97c3..955e9b45f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -12,6 +12,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST
 from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech import UNIV_POS_NAMES
 from ..parts_of_speech cimport CONJ, PUNCT, NOUN
+from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport check_flag
 from ..lexeme cimport get_attr as get_lex_attr
 from .spans cimport Span
@@ -327,6 +328,9 @@ cdef class Doc:
             elif attr_id == TAG:
                 for i in range(length):
                     tokens[i].tag = values[i]
+            elif attr_id == POS:
+                for i in range(length):
+                    tokens[i].pos = values[i]
             elif attr_id == DEP:
                 for i in range(length):
                     tokens[i].dep = values[i]
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index f1f2696cb..cc50fdd08 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -142,7 +142,7 @@ cdef class Token:
             """The leftward immediate children of the word, in the syntactic
             dependency parse.
             """
-            cdef const TokenC* ptr = self.c - self.i
+            cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
             while ptr < self.c:
                 # If this head is still to the right of us, we can skip to it
                 # No token that's between this token and this head could be our
@@ -160,7 +160,7 @@ cdef class Token:
         def __get__(self):
            """The rightward immediate children of the word, in the syntactic
            dependency parse."""
-            cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
+            cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
             tokens = []
             while ptr > self.c:
                 # If this head is still to the right of us, we can skip to it
diff --git a/tests/parser/test_initial_actions_parse.py b/tests/parser/test_initial_actions_parse.py
index c1603cd93..9f570d8be 100644
--- a/tests/parser/test_initial_actions_parse.py
+++ b/tests/parser/test_initial_actions_parse.py
@@ -4,7 +4,10 @@ import pytest
 def test_initial(EN):
     doc = EN.tokenizer(u'I ate the pizza with anchovies.')
     EN.tagger(doc)
-    next_actions = EN.parser.partial(doc, ['L-nsubj', 'S', 'L-det'])
+    with EN.parser.step_through(doc) as stepwise:
+        stepwise.transition('L-nsubj')
+        stepwise.transition('S')
+        stepwise.transition('L-det')
     assert doc[0].head.i == 1
     assert doc[1].head.i == 1
     assert doc[2].head.i == 3
diff --git a/tests/test_matcher.py b/tests/test_matcher.py
index 0014e1110..1b748cb53 100644
--- a/tests/test_matcher.py
+++ b/tests/test_matcher.py
@@ -3,7 +3,7 @@ import pytest
 
 from spacy.strings import StringStore
 from spacy.matcher import *
-from spacy.attrs import ORTH
+from spacy.attrs import LOWER
 from spacy.tokens.doc import Doc
 from spacy.vocab import Vocab
 
@@ -13,7 +13,7 @@ def matcher(EN):
     patterns = {
         'Javascript': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]],
         'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]],
-        'Java': ['PRODUCT', {}, [[{'ORTH': 'Java'}]]],
+        'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]],
     }
     return Matcher(EN.vocab, patterns)
 
@@ -33,7 +33,7 @@ def test_match_start(matcher, EN):
 
 
 def test_match_end(matcher, EN):
-    tokens = EN('I like Java')
+    tokens = EN('I like java')
     assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 3)]
 
 
@@ -43,17 +43,17 @@ def test_match_middle(matcher, EN):
 
 
 def test_match_multi(matcher, EN):
-    tokens = EN('I like Google Now and Java best')
+    tokens = EN('I like Google Now and java best')
     assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 4),
                                (EN.vocab.strings['PRODUCT'], 5, 6)]
 
 
 def test_match_preserved(matcher, EN):
-    doc = EN.tokenizer('I like Java')
+    doc = EN.tokenizer('I like java')
     EN.tagger(doc)
     EN.entity(doc)
     assert len(doc.ents) == 0
-    doc = EN.tokenizer('I like Java')
+    doc = EN.tokenizer('I like java')
     matcher(doc)
     assert len(doc.ents) == 1
     EN.tagger(doc)