From 56c4e07a59ae7cd35b00a9de0ee0666938396104 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 27 Aug 2015 08:53:48 +1000 Subject: [PATCH 1/5] Update gazetteer.json --- lang_data/en/gazetteer.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lang_data/en/gazetteer.json b/lang_data/en/gazetteer.json index 1aa6b9514..dce2e1f2a 100644 --- a/lang_data/en/gazetteer.json +++ b/lang_data/en/gazetteer.json @@ -14,8 +14,8 @@ {"orth": "9/11"} ], [ - {"lower": "Septmber"}, - {"lower": "Eleven"} + {"lower": "septmber"}, + {"lower": "eleven"} ], [ {"lower": "september"}, From fd1eeb3102d65504d0267861355d61e24e731086 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 04:13:03 +0200 Subject: [PATCH 2/5] * Add POS attribute support in get_attr --- spacy/tokens/doc.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 7994c97c3..955e9b45f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -12,6 +12,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..parts_of_speech import UNIV_POS_NAMES from ..parts_of_speech cimport CONJ, PUNCT, NOUN +from ..parts_of_speech cimport univ_pos_t from ..lexeme cimport check_flag from ..lexeme cimport get_attr as get_lex_attr from .spans cimport Span @@ -327,6 +328,9 @@ cdef class Doc: elif attr_id == TAG: for i in range(length): tokens[i].tag = values[i] + elif attr_id == POS: + for i in range(length): + tokens[i].pos = values[i] elif attr_id == DEP: for i in range(length): tokens[i].dep = values[i] From 5edac11225b4435daac5776dd52ca105bc1d5233 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 04:15:00 +0200 Subject: [PATCH 3/5] * Wrap self.parse in nogil, and break if an invalid move is predicted. 
The invalid break is a work-around that papers over likely bugs, but we can't easily raise in the nogil block, and otherwise we'll get an infinite loop. Need to set this as an error flag. --- spacy/syntax/parser.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 6282339bd..59b90920c 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -84,8 +84,7 @@ cdef class Parser: cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats, self.model.n_feats) - with nogil: - self.parse(stcls, eg.c) + self.parse(stcls, eg.c) tokens.set_parse(stcls._sent) cdef void predict(self, StateClass stcls, ExampleC* eg) nogil: @@ -98,6 +97,8 @@ cdef class Parser: cdef void parse(self, StateClass stcls, ExampleC eg) nogil: while not stcls.is_final(): self.predict(stcls, &eg) + if not eg.is_valid[eg.guess]: + break self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) self.moves.finalize_state(stcls) From 571b6eda88bb72078b88b9a600455cb8ed3ab622 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 05:40:10 +0200 Subject: [PATCH 4/5] * Upd tests --- tests/parser/test_initial_actions_parse.py | 5 ++++- tests/test_matcher.py | 12 ++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/parser/test_initial_actions_parse.py b/tests/parser/test_initial_actions_parse.py index c1603cd93..9f570d8be 100644 --- a/tests/parser/test_initial_actions_parse.py +++ b/tests/parser/test_initial_actions_parse.py @@ -4,7 +4,10 @@ import pytest def test_initial(EN): doc = EN.tokenizer(u'I ate the pizza with anchovies.') EN.tagger(doc) - next_actions = EN.parser.partial(doc, ['L-nsubj', 'S', 'L-det']) + with EN.parser.step_through(doc) as stepwise: + stepwise.transition('L-nsubj') + stepwise.transition('S') + stepwise.transition('L-det') assert doc[0].head.i == 1 assert doc[1].head.i == 1 assert doc[2].head.i == 3 diff --git 
a/tests/test_matcher.py b/tests/test_matcher.py index 0014e1110..1b748cb53 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -3,7 +3,7 @@ import pytest from spacy.strings import StringStore from spacy.matcher import * -from spacy.attrs import ORTH +from spacy.attrs import LOWER from spacy.tokens.doc import Doc from spacy.vocab import Vocab @@ -13,7 +13,7 @@ def matcher(EN): patterns = { 'Javascript': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]], 'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]], - 'Java': ['PRODUCT', {}, [[{'ORTH': 'Java'}]]], + 'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]], } return Matcher(EN.vocab, patterns) @@ -33,7 +33,7 @@ def test_match_start(matcher, EN): def test_match_end(matcher, EN): - tokens = EN('I like Java') + tokens = EN('I like java') assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 3)] @@ -43,17 +43,17 @@ def test_match_middle(matcher, EN): def test_match_multi(matcher, EN): - tokens = EN('I like Google Now and Java best') + tokens = EN('I like Google Now and java best') assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 4), (EN.vocab.strings['PRODUCT'], 5, 6)] def test_match_preserved(matcher, EN): - doc = EN.tokenizer('I like Java') + doc = EN.tokenizer('I like java') EN.tagger(doc) EN.entity(doc) assert len(doc.ents) == 0 - doc = EN.tokenizer('I like Java') + doc = EN.tokenizer('I like java') matcher(doc) assert len(doc.ents) == 1 EN.tagger(doc) From 7e4fea67d39dd85b9aeed396a055cdd7e4e31971 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 10:48:36 +0200 Subject: [PATCH 5/5] * Fix bug in token subtree, introduced by duplication of L/R code in Stateclass. Need to consolidate the two methods. 
--- spacy/tokens/token.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index f1f2696cb..cc50fdd08 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -142,7 +142,7 @@ cdef class Token: """The leftward immediate children of the word, in the syntactic dependency parse. """ - cdef const TokenC* ptr = self.c - self.i + cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge) while ptr < self.c: # If this head is still to the right of us, we can skip to it # No token that's between this token and this head could be our @@ -160,7 +160,7 @@ cdef class Token: def __get__(self): """The rightward immediate children of the word, in the syntactic dependency parse.""" - cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1) + cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) tokens = [] while ptr > self.c: # If this head is still to the right of us, we can skip to it