Merge branch 'master' of ssh://github.com/explosion/spaCy

Matthew Honnibal 2016-11-04 20:03:07 +01:00
commit bd6e24fe0e
7 changed files with 16 additions and 16 deletions

View File

@@ -4,7 +4,7 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy'
-__version__ = '1.1.2'
+__version__ = '1.1.3'
__summary__ = 'Industrial-strength NLP'
__uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'

View File

@@ -35,7 +35,7 @@ cdef class Morphology:
cdef int assign_tag(self, TokenC* token, tag) except -1
-cdef int _assign_tag_id(self, TokenC* token, int tag_id) except -1
+cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1

View File

@@ -39,9 +39,9 @@ cdef class Morphology:
tag_id = self.reverse_index[self.strings[tag]]
else:
tag_id = self.reverse_index[tag]
-self._assign_tag_id(token, tag_id)
+self.assign_tag_id(token, tag_id)
-cdef int _assign_tag_id(self, TokenC* token, int tag_id) except -1:
+cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id >= self.n_tags:
raise ValueError("Unknown tag ID: %s" % tag_id)
# TODO: It's pretty arbitrary to put this logic here. I guess the justification

View File

@@ -196,7 +196,7 @@ cdef class Tagger:
self.model.set_scoresC(eg.c.scores,
eg.c.features, eg.c.nr_feat)
guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
-self.vocab.morphology.assign_tag(&tokens.c[i], guess)
+self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
eg.fill_scores(0, eg.c.nr_class)
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
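
The morphology and tagger hunks above rename the private _assign_tag_id to a public assign_tag_id and have the tagger call it directly: the guess coming out of arg_max_if_true is already a tag index, while assign_tag treats its argument as a tag name or string ID and resolves it through reverse_index first. Below is a simplified, purely illustrative Python model of those two entry points (the real methods are Cython and take a TokenC*; the class and token representation here are the editor's sketch, mirroring the branches shown in the morphology hunk):

    class MorphologySketch(object):
        def __init__(self, tag_names):
            self.strings = {}        # fake string store: tag name -> string ID
            self.reverse_index = {}  # string ID -> tag index
            for i, name in enumerate(tag_names):
                sid = 1000 + i
                self.strings[name] = sid
                self.reverse_index[sid] = i
            self.n_tags = len(tag_names)

        def assign_tag(self, token, tag):
            # Accepts a tag name or a string ID; resolves it to a tag index first.
            if isinstance(tag, str):
                tag_id = self.reverse_index[self.strings[tag]]
            else:
                tag_id = self.reverse_index[tag]
            self.assign_tag_id(token, tag_id)

        def assign_tag_id(self, token, tag_id):
            # Takes the tag index directly, which is what the tagger's argmax produces.
            if tag_id >= self.n_tags:
                raise ValueError("Unknown tag ID: %s" % tag_id)
            token['tag_id'] = tag_id

    morph = MorphologySketch(['DT', 'NN', 'VBD'])
    token = {}
    morph.assign_tag(token, 'NN')   # by name, resolved via reverse_index
    morph.assign_tag_id(token, 1)   # by index, as the tagger now does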

View File

@@ -107,10 +107,11 @@ cdef class Tokenizer:
return (self.__class__, args, None, None)
cpdef Doc tokens_from_list(self, list strings):
-raise NotImplementedError(
-    "Method deprecated in 1.0.\n"
-    "Old: tokenizer.tokens_from_list(strings)\n"
-    "New: Doc(tokenizer.vocab, words=strings)")
+return Doc(self.vocab, words=strings)
+#raise NotImplementedError(
+#    "Method deprecated in 1.0.\n"
+#    "Old: tokenizer.tokens_from_list(strings)\n"
+#    "New: Doc(tokenizer.vocab, words=strings)")
@cython.boundscheck(False)
def __call__(self, unicode string):
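
This hunk makes tokens_from_list return a Doc again instead of raising, while the commented-out message still records the preferred replacement. A minimal sketch of that replacement, assuming an English model is installed under the name 'en':

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')
    words = [u'Hello', u'world', u'!']

    # Deprecated since 1.0 (kept working by this commit):
    # doc = nlp.tokenizer.tokens_from_list(words)

    # Preferred construction per the deprecation message:
    doc = Doc(nlp.vocab, words=words)
    assert [w.text for w in doc] == words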

View File

@@ -577,8 +577,7 @@ cdef class Doc:
elif attr_id == TAG:
for i in range(length):
if values[i] != 0:
-self.vocab.morphology.assign_tag(&tokens[i],
-    self.vocab.morphology.reverse_index[values[i]])
+self.vocab.morphology.assign_tag(&tokens[i], values[i])
elif attr_id == POS:
for i in range(length):
tokens[i].pos = <univ_pos_t>values[i]
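
Since assign_tag now resolves names and string IDs itself (see the morphology hunk above), this code path can pass the raw TAG value straight through instead of pre-resolving it via reverse_index. A hedged sketch of the round trip this serves, assuming the 'en' model is installed:

    import spacy
    from spacy.attrs import TAG
    from spacy.tokens import Doc

    nlp = spacy.load('en')
    doc = nlp(u'the dog barked')
    tag_array = doc.to_array([TAG])

    # Rebuild a Doc from the same words and restore the tags from the array.
    doc2 = Doc(nlp.vocab, words=[w.text for w in doc])
    doc2.from_array([TAG], tag_array)
    assert [w.tag_ for w in doc2] == [w.tag_ for w in doc]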

View File

@@ -43,7 +43,7 @@ p
| #[code token.dep_].
+aside-code("Example").
-from spacy.symbols import DET
+from spacy.symbols import det
the, dog = nlp(u'the dog')
assert the.dep == det
assert the.dep_ == 'det'
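
The import is corrected because spacy.symbols exposes both the upper-case universal POS tag DET and the lower-case dependency label det, and token.dep holds the dependency label. A short sketch of the distinction, assuming the 'en' model tags 'the' this way:

    import spacy
    from spacy.symbols import DET, det

    nlp = spacy.load('en')
    the = nlp(u'the dog')[0]

    assert the.pos == DET      # coarse part-of-speech: determiner
    assert the.dep == det      # dependency relation to its head
    assert the.dep_ == 'det'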
@@ -96,14 +96,14 @@ p
print([w.text for w in apples.rights])
# ['on']
assert apples.n_lefts == 2
-assert apples.n_rights == 3
+assert apples.n_rights == 1
from spacy.symbols import nsubj
doc = nlp(u'Credit and mortgage account holders must submit their requests within 30 days.')
root = [w for w in doc if w.head is w][0]
subject = list(root.lefts)[0]
for descendant in subject.subtree:
-assert subject.is_ancestor(descendant)
+assert subject.is_ancestor_of(descendant)
from spacy.symbols import nsubj
doc = nlp(u'Credit and mortgage account holders must submit their requests.')
@@ -131,7 +131,7 @@ p
p
| Finally, I often find the #[code .left_edge] and #[code right_edge]
-| attributes especially useful. They give you the first and right tokens
+| attributes especially useful. They give you the first and last token
| of the subtree. This is the easiest way to create a #[code Span] object
| for a syntactic phrase — a useful operation.
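
As the corrected paragraph says, left_edge and right_edge give the first and last token of the subtree, so a Span for the whole phrase is just a slice between their indices. A sketch using the same sentence and root/subject idiom as the example above; the slice itself is the editor's illustration, not taken from the docs:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'Credit and mortgage account holders must submit their requests.')

    root = [w for w in doc if w.head is w][0]
    subject = list(root.lefts)[0]

    # Slice from the subtree's first token to its last token (inclusive).
    span = doc[subject.left_edge.i : subject.right_edge.i + 1]
    print(span.text)   # e.g. 'Credit and mortgage account holders'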
@@ -160,4 +160,4 @@ p
+code.
nlp = spacy.load('en')
doc1 = nlp(u'Text I do want parsed.')
-doc2 = nlp(u'Text I don't want parsed', parser=False)
+doc2 = nlp(u"Text I don't want parsed", parse=False)
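
The corrected line fixes both the quoting (the apostrophe in "don't" breaks a single-quoted literal) and the keyword, which in spaCy 1.x is parse=False. A small sketch of the effect, assuming Doc.is_parsed reflects whether the parse was run:

    import spacy

    nlp = spacy.load('en')
    doc1 = nlp(u'Text I do want parsed.')
    doc2 = nlp(u"Text I don't want parsed", parse=False)

    assert doc1.is_parsed
    assert not doc2.is_parsed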