Merge branch 'gaz' of ssh://github.com/honnibal/spaCy into gaz

Matthew Honnibal 2015-09-06 18:44:14 +02:00
commit dbf8dce109
6 changed files with 21 additions and 13 deletions


@@ -14,8 +14,8 @@
         {"orth": "9/11"}
     ],
     [
-        {"lower": "Septmber"},
-        {"lower": "Eleven"}
+        {"lower": "septmber"},
+        {"lower": "eleven"}
     ],
     [
         {"lower": "september"},


@@ -84,8 +84,7 @@ cdef class Parser:
         cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
                                   self.model.n_feats, self.model.n_feats)
-        with nogil:
-            self.parse(stcls, eg.c)
+        self.parse(stcls, eg.c)
         tokens.set_parse(stcls._sent)

     cdef void predict(self, StateClass stcls, ExampleC* eg) nogil:
@@ -98,6 +97,8 @@ cdef class Parser:
     cdef void parse(self, StateClass stcls, ExampleC eg) nogil:
         while not stcls.is_final():
             self.predict(stcls, &eg)
+            if not eg.is_valid[eg.guess]:
+                break
             self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
         self.moves.finalize_state(stcls)
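Two related parser changes: the `with nogil` wrapper around the top-level `parse()` call is dropped, and the transition loop gains a guard so that an invalid best-scoring action stops parsing instead of being applied. A Python-level sketch of the resulting control flow (the objects stand in for the Cython `StateClass`/`ExampleC` types; this is not spaCy's real API):

```python
# Python-level sketch of the greedy transition loop above; the objects are
# stand-ins for the Cython StateClass/ExampleC types, not spaCy's real API.
def parse(state, moves, predict):
    while not state.is_final():
        eg = predict(state)            # fills eg.scores, eg.is_valid, eg.guess
        if not eg.is_valid[eg.guess]:  # new guard: stop rather than apply
            break                      # an illegal transition
        moves[eg.guess].do(state)      # apply the highest-scoring valid move
    # the Cython code then calls moves.finalize_state(state)
```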


@@ -12,6 +12,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST
 from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech import UNIV_POS_NAMES
 from ..parts_of_speech cimport CONJ, PUNCT, NOUN
+from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport check_flag
 from ..lexeme cimport get_attr as get_lex_attr
 from .spans cimport Span
@@ -327,6 +328,9 @@ cdef class Doc:
         elif attr_id == TAG:
             for i in range(length):
                 tokens[i].tag = values[i]
+        elif attr_id == POS:
+            for i in range(length):
+                tokens[i].pos = <univ_pos_t>values[i]
         elif attr_id == DEP:
             for i in range(length):
                 tokens[i].dep = values[i]
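The new `POS` branch lets part-of-speech values round-trip through the Doc's array interface alongside `TAG` and `DEP`; the cast is needed because `pos` is a `univ_pos_t` enum field, hence the new cimport. A hedged usage sketch, assuming the loop above lives in `Doc.from_array` and pairs with `Doc.to_array`:

```python
# Hedged sketch, assuming the loop above lives in Doc.from_array and pairs
# with Doc.to_array; `nlp` is a loaded pipeline.
from spacy.attrs import POS, TAG

def roundtrip(nlp, text):
    doc = nlp(text)
    cols = doc.to_array([POS, TAG])     # one row of attribute values per token
    fresh = nlp.tokenizer(text)         # untagged copy of the same text
    fresh.from_array([POS, TAG], cols)  # POS is now restored too, not just TAG
    return [t.pos_ for t in fresh]
```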


@@ -142,7 +142,7 @@ cdef class Token:
             """The leftward immediate children of the word, in the syntactic
             dependency parse.
             """
-            cdef const TokenC* ptr = self.c - self.i
+            cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
             while ptr < self.c:
                 # If this head is still to the right of us, we can skip to it
                 # No token that's between this token and this head could be our
@@ -160,7 +160,7 @@ cdef class Token:
         def __get__(self):
             """The rightward immediate children of the word, in the syntactic
             dependency parse."""
-            cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
+            cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
             tokens = []
             while ptr > self.c:
                 # If this head is still to the right of us, we can skip to it
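Both pointer fixes replace scans anchored at the boundaries of the token array with scans bounded by the token's own subtree edges: in a projective parse no left child can precede `l_edge` and no right child can follow `r_edge`, so scanning from the array boundaries was wasted work. An equivalent token-level sketch using spaCy's `left_edge`/`right_edge` attributes (the generators themselves are illustrative, not the Cython code):

```python
# Illustration of the bounds the pointer arithmetic above relies on: every
# left child of token i lies in [i.left_edge, i) and every right child lies
# in (i, i.right_edge]. rights() scans from the right edge inward, matching
# the decreasing `while ptr > self.c` loop.
def lefts(doc, i):
    for j in range(doc[i].left_edge.i, i):
        if doc[j].head.i == i:          # immediate left child of i
            yield doc[j]

def rights(doc, i):
    for j in range(doc[i].right_edge.i, i, -1):
        if doc[j].head.i == i:          # immediate right child of i
            yield doc[j]
```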


@@ -4,7 +4,10 @@ import pytest
 def test_initial(EN):
     doc = EN.tokenizer(u'I ate the pizza with anchovies.')
     EN.tagger(doc)
-    next_actions = EN.parser.partial(doc, ['L-nsubj', 'S', 'L-det'])
+    with EN.parser.step_through(doc) as stepwise:
+        stepwise.transition('L-nsubj')
+        stepwise.transition('S')
+        stepwise.transition('L-det')
     assert doc[0].head.i == 1
     assert doc[1].head.i == 1
     assert doc[2].head.i == 3
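The test now drives the parser action by action through the `step_through()` context manager instead of the removed `parser.partial()` call. A hedged sketch of the pattern generalised to any action prefix (only `step_through()` and `transition()` are attested by the diff; the write-back-on-exit behaviour is an assumption):

```python
# Generalised form of the test above. Only step_through() and transition()
# appear in the diff; that the partial parse is written back to doc when the
# context manager exits is assumed from the test's assertions.
def force_actions(parser, doc, actions):
    with parser.step_through(doc) as stepwise:
        for name in actions:           # e.g. ['L-nsubj', 'S', 'L-det']
            stepwise.transition(name)
    return doc                         # heads set by the forced transitions
```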


@@ -3,7 +3,7 @@ import pytest

 from spacy.strings import StringStore
 from spacy.matcher import *
-from spacy.attrs import ORTH
+from spacy.attrs import LOWER
 from spacy.tokens.doc import Doc
 from spacy.vocab import Vocab
@@ -13,7 +13,7 @@ def matcher(EN):
     patterns = {
         'Javascript': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]],
         'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]],
-        'Java': ['PRODUCT', {}, [[{'ORTH': 'Java'}]]],
+        'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]],
     }
     return Matcher(EN.vocab, patterns)
@@ -33,7 +33,7 @@ def test_match_start(matcher, EN):

 def test_match_end(matcher, EN):
-    tokens = EN('I like Java')
+    tokens = EN('I like java')
     assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 3)]
@@ -43,17 +43,17 @@ def test_match_middle(matcher, EN):

 def test_match_multi(matcher, EN):
-    tokens = EN('I like Google Now and Java best')
+    tokens = EN('I like Google Now and java best')
     assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 4),
                                (EN.vocab.strings['PRODUCT'], 5, 6)]


 def test_match_preserved(matcher, EN):
-    doc = EN.tokenizer('I like Java')
+    doc = EN.tokenizer('I like java')
     EN.tagger(doc)
     EN.entity(doc)
     assert len(doc.ents) == 0
-    doc = EN.tokenizer('I like Java')
+    doc = EN.tokenizer('I like java')
     matcher(doc)
     assert len(doc.ents) == 1
     EN.tagger(doc)
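Switching the 'Java' pattern from `ORTH` to `LOWER` makes it case-insensitive, which is exactly what the updated lowercase 'java' sentences exercise; the `ORTH` patterns still require the exact surface form. A standalone sketch in the fixture's 2015-era pattern format (entity label, attrs dict, list of token-spec lists), assuming `nlp` is a loaded English pipeline:

```python
# Standalone sketch using the 2015-era pattern format from the fixture above;
# only the LOWER vs ORTH distinction is the point here.
from spacy.matcher import Matcher

patterns = {
    'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]],             # Java, JAVA, java...
    'Javascript': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]],  # exact form only
}

def find_products(nlp, text):
    matcher = Matcher(nlp.vocab, patterns)
    doc = nlp(text)
    return matcher(doc)   # [(label_id, start, end), ...] as in the tests
```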